list.of.packages <- c("tidyverse", "reshape2", "plotly", "vcfR", "SNPRelate", "network", "janitor", "vcfR", "corrplot", "PerformanceAnalytics", "ggpubr") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
load(file="../raw-data/sample.info")
sample.info.adult <- sample.info %>%
select(sample, stage, population, pCO2.parent, temp.parent) %>% filter(stage=="adult") %>%
mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-"))) %>% droplevels()
sample.info.larvae <- sample.info %>%
select(sample, stage, population, pCO2.parent) %>% filter(stage=="larvae" & sample!=571) %>%
mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-")),
sample=str_pad(sample, width=3, side="left", pad="0")) %>% droplevels()
sample.info.juvenile <- sample.info %>%
select(sample, stage, population, pCO2.parent) %>% filter(stage=="juvenile") %>%
mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-")),
sample=str_pad(sample, width=3, side="left", pad="0")) %>% droplevels()
/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 11:14:09.344 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 21, 2021 11:14:09 AM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 11:14:09.651 INFO VariantFiltration - ------------------------------------------------------------
## 11:14:09.651 INFO VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 11:14:09.651 INFO VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 11:14:09.651 INFO VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 11:14:09.651 INFO VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 11:14:09.651 INFO VariantFiltration - Start Date/Time: April 21, 2021 at 11:14:09 AM PDT
## 11:14:09.651 INFO VariantFiltration - ------------------------------------------------------------
## 11:14:09.651 INFO VariantFiltration - ------------------------------------------------------------
## 11:14:09.652 INFO VariantFiltration - HTSJDK Version: 2.23.0
## 11:14:09.652 INFO VariantFiltration - Picard Version: 2.23.3
## 11:14:09.652 INFO VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 11:14:09.652 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 11:14:09.652 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 11:14:09.652 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 11:14:09.652 INFO VariantFiltration - Deflater: IntelDeflater
## 11:14:09.652 INFO VariantFiltration - Inflater: IntelInflater
## 11:14:09.652 INFO VariantFiltration - GCS max retries/reopens: 20
## 11:14:09.652 INFO VariantFiltration - Requester pays: disabled
## 11:14:09.652 INFO VariantFiltration - Initializing engine
## 11:14:09.803 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz
## 11:14:09.899 INFO VariantFiltration - Done initializing engine
## 11:14:09.979 INFO ProgressMeter - Starting traversal
## 11:14:09.979 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 11:14:19.984 INFO ProgressMeter - Contig41068_47193:9065967 0.2 562000 3370651.7
## 11:14:25.786 INFO ProgressMeter - Contig98527_104152:29794069 0.3 924617 3509870.9
## 11:14:25.786 INFO ProgressMeter - Traversal complete. Processed 924617 total variants in 0.3 minutes.
## 11:14:25.858 INFO VariantFiltration - Shutting down engine
## [April 21, 2021 at 11:14:25 AM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.28 minutes.
## Runtime.totalMemory()=465567744
/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## 11:14:27.570 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 21, 2021 11:14:27 AM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 11:14:27.737 INFO SelectVariants - ------------------------------------------------------------
## 11:14:27.737 INFO SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 11:14:27.737 INFO SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 11:14:27.737 INFO SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 11:14:27.737 INFO SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 11:14:27.737 INFO SelectVariants - Start Date/Time: April 21, 2021 at 11:14:27 AM PDT
## 11:14:27.737 INFO SelectVariants - ------------------------------------------------------------
## 11:14:27.737 INFO SelectVariants - ------------------------------------------------------------
## 11:14:27.738 INFO SelectVariants - HTSJDK Version: 2.23.0
## 11:14:27.738 INFO SelectVariants - Picard Version: 2.23.3
## 11:14:27.738 INFO SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 11:14:27.738 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 11:14:27.738 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 11:14:27.738 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 11:14:27.738 INFO SelectVariants - Deflater: IntelDeflater
## 11:14:27.738 INFO SelectVariants - Inflater: IntelInflater
## 11:14:27.738 INFO SelectVariants - GCS max retries/reopens: 20
## 11:14:27.738 INFO SelectVariants - Requester pays: disabled
## 11:14:27.738 INFO SelectVariants - Initializing engine
## 11:14:27.867 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz
## 11:14:27.960 INFO SelectVariants - Done initializing engine
## 11:14:27.992 INFO ProgressMeter - Starting traversal
## 11:14:27.992 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 11:14:38.028 INFO ProgressMeter - Contig129983_136896:10380799 0.2 189000 1129932.2
## 11:14:48.037 INFO ProgressMeter - Contig34748_41067:22021404 0.3 401000 1200299.3
## 11:14:58.051 INFO ProgressMeter - Contig59084_64831:31492429 0.5 612000 1221638.2
## 11:15:04.426 INFO ProgressMeter - Contig98527_104152:29059358 0.6 747969 1231765.4
## 11:15:04.426 INFO ProgressMeter - Traversal complete. Processed 747969 total variants in 0.6 minutes.
## 11:15:04.477 INFO SelectVariants - Shutting down engine
## [April 21, 2021 at 11:15:04 AM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.62 minutes.
## Runtime.totalMemory()=465567744
#snpgdsClose(genofile.adult)
vcf.fn.adult <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.adult, "../qc-processing/gatk/genotypes-adult.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 53
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz" ...
## import 175243 variants.
## + genotype { Bit2 53x175243, 2.2M } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../qc-processing/gatk/genotypes-adult.gds' (3.1M)
## # of fragments: 68
## save to '../qc-processing/gatk/genotypes-adult.gds.tmp'
## rename '../qc-processing/gatk/genotypes-adult.gds.tmp' (3.1M, reduced: 576B)
## # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-adult.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-adult.gds
## The total number of samples: 53
## The total number of SNPs: 175243
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
(genofile.adult <- snpgdsOpen("../qc-processing/gatk/genotypes-adult.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-adult.gds (3.1M)
## + [ ] *
## |--+ sample.id { Str8 53 LZMA_ra(76.4%), 169B }
## |--+ snp.id { Int32 175243 LZMA_ra(6.47%), 44.3K }
## |--+ snp.rs.id { Str8 175243 LZMA_ra(0.10%), 181B }
## |--+ snp.position { Int32 175243 LZMA_ra(40.8%), 279.2K }
## |--+ snp.chromosome { Str8 175243 LZMA_ra(0.03%), 973B }
## |--+ snp.allele { Str8 175243 LZMA_ra(11.9%), 81.7K }
## |--+ genotype { Bit2 53x175243, 2.2M } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 175243 LZMA_ra(65.4%), 448.0K }
## \--+ filter { Str8 175243 LZMA_ra(0.03%), 285B }
snpset.adult <- snpgdsLDpruning(genofile.adult, maf=.1, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 174,670 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0)
## # of samples: 53
## # of SNPs: 573
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.23%, 50/21,863
## Chromosome Contig104153_109739: 0.13%, 3/2,360
## Chromosome Contig109741_116186: 0.27%, 5/1,861
## Chromosome Contig11179_21262: 0.29%, 46/15,668
## Chromosome Contig116188_123130: 0.11%, 2/1,794
## Chromosome Contig123131_129982: 0.22%, 4/1,787
## Chromosome Contig129983_136896: 0.22%, 4/1,798
## Chromosome Contig136897_143777: 0.16%, 3/1,887
## Chromosome Contig143780_150710: 0.06%, 1/1,746
## Chromosome Contig150711_166372: 0.40%, 6/1,485
## Chromosome Contig166374_181896: 0.21%, 4/1,875
## Chromosome Contig181898_197428: 0.20%, 3/1,516
## Chromosome Contig197431_213218: 0.06%, 1/1,777
## Chromosome Contig21263_28135: 0.30%, 54/18,170
## Chromosome Contig213221_398012: 0.38%, 7/1,819
## Chromosome Contig28136_34745: 0.26%, 41/15,929
## Chromosome Contig34748_41067: 0.40%, 54/13,582
## Chromosome Contig398123_696856: 0.29%, 5/1,721
## Chromosome Contig41068_47193: 0.25%, 28/11,130
## Chromosome Contig47194_53180: 0.31%, 28/9,167
## Chromosome Contig5314_11178: 0.17%, 9/5,362
## Chromosome Contig53181_59083: 0.22%, 17/7,588
## Chromosome Contig59084_64831: 0.22%, 14/6,364
## Chromosome Contig64832_70524: 0.28%, 16/5,633
## Chromosome Contig70525_76135: 0.28%, 13/4,689
## Chromosome Contig76136_81765: 0.17%, 7/4,040
## Chromosome Contig81766_87352: 0.24%, 10/4,163
## Chromosome Contig87353_92929: 0.29%, 8/2,773
## Chromosome Contig92931_98526: 0.22%, 6/2,712
## Chromosome Contig98527_104152: 0.27%, 8/2,984
## 457 markers are selected in total.
snpset.adult.id <- unlist(unname(snpset.adult))
# PCA
pca.adult <- snpgdsPCA(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 53
## # of SNPs: 457
## using 2 threads
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 37378
## CPU capabilities: Double-Precision SSE2
## Wed Apr 21 11:15:11 2021 (internal increment: 19088)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:15:11 2021 Begin (eigenvalues and eigenvectors)
## Wed Apr 21 11:15:11 2021 Done.
pc.percent.adult <- pca.adult$varprop*100
head(round(pc.percent.adult, 2))
## [1] 9.38 6.35 5.61 4.68 4.47 4.03
tab.adult <- data.frame(sample.id = pca.adult$sample.id,
EV1 = pca.adult$eigenvect[,1], # the first eigenvector
EV2 = pca.adult$eigenvect[,2], # the second eigenvector
EV3 = pca.adult$eigenvect[,3], # the third eigenvector
EV4 = pca.adult$eigenvect[,4], # the fourth eigenvector
EV5 = pca.adult$eigenvect[,5], # the fifth eigenvector
stringsAsFactors = FALSE)
tab.adult.annot <- left_join(tab.adult, sample.info.adult, by=c("sample.id"="sample")) %>% droplevels()
group.adult <- as.factor(tab.adult.annot$pop_code)
#ggplotly(
ggplot(tab.adult.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC2")#)
ggplotly(
ggplot(tab.adult.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC3"))
ggplotly(
ggplot(tab.adult.annot, aes(x=EV4, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC4"))
ggplotly(
ggplot(tab.adult.annot, aes(x=EV5, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC5"))
save(tab.adult.annot, file="../results/tab.adult.annot")
# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances,
# and determine the groups by a permutation score.
# Result= only 1 group (?)
ibs.hc.adult <- snpgdsHCluster(snpgdsIBS(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 53
## # of SNPs: 457
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 37378
## Wed Apr 21 11:15:12 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:15:12 2021 Done.
rv.adult <- snpgdsCutTree(ibs.hc.adult)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.adult$dendrogram, leaflab="none", main="HapMap Phase II")
# Determine groups of individuals by population information
rv2.adult <- snpgdsCutTree(ibs.hc.adult, samp.group=as.factor(tab.adult.annot$pop_code))
## Create 6 groups.
plot(rv2.adult$dendrogram, leaflab="none", main="HapMap Phase II")
legend("bottom", legend=levels(group.adult), col=1:nlevels(group.adult), pch=19, ncol=4, cex=0.6, pt.cex=1)
AF.adult.FB.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.adult.FB.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.adult.DB.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.adult.DB.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.adult.OB1.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Oyster Bay C1" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.adult.OB1.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Oyster Bay C1" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.adults <- rbind(do.call(cbind.data.frame, AF.adult.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.adult.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.adult.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.adult.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.adult.OB1.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.adult.OB1.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"), pCO2=as.factor("ambient")))
ggplot(AF.adults, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
ggtitle(paste("SNP Allele Frequency, Adults by population & pCO2 treatment\n", "(No. loci = ", length(snpset.adult.id), ")", sep="")) +
geom_text(data=AF.adults %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2)))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Adult Fst Calculations
# within FB (comparing pCO2 exposure):
Fst.adult.FB <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Fidalgo Bay")$sample.id,
population=as.factor(subset(tab.adult.annot, population=="Fidalgo Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 19 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 18
## # of SNPs: 438
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (9), High (9)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.adult.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: 0.016"
hist(Fst.adult.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, Adults", col="dodgerblue3")
# within DB (comparing pCO2 exposure):
Fst.adult.DB <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Dabob Bay")$sample.id,
population=as.factor(subset(tab.adult.annot, population=="Dabob Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 33 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 17
## # of SNPs: 424
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (9), High (8)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.adult.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: 0.0153"
hist(Fst.adult.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, Adults", col="darkseagreen3")
# within OB1 (comparing pCO2 exposure):
Fst.adult.OB1 <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Oyster Bay C1")$sample.id,
population=as.factor(subset(tab.adult.annot, population=="Oyster Bay C1")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 36 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 18
## # of SNPs: 421
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (9), High (9)
paste("Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: ", round(Fst.adult.OB1$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: -0.01317"
hist(Fst.adult.OB1$FstSNP, breaks = 100, main="Fst distribution\nOyster Bay C1 Ambient vs. High, Adults", col="salmon2")
Fst.adults <- rbind(do.call(cbind.data.frame, Fst.adult.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult")),
do.call(cbind.data.frame, Fst.adult.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult")),
do.call(cbind.data.frame, Fst.adult.OB1[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult")))
ggplot(Fst.adults, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) +
stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
ggtitle(paste("SNP Fst, Adults by pCO2 treatment\n", "(No. loci = ", length(snpset.adult.id), ")", sep="")) +
geom_text(data=Fst.adults %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=.65, label=round(mean, 3)))
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Adult Identity-by-state
lbls <- paste("PC", 1:4, "\n", format(pc.percent.adult[1:4], digits=2), "%", sep="")
pairs(pca.adult$eigenvect[,1:4], col=as.color(tab.adult.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")
pairs(pca.adult$eigenvect[,1:4], col=as.color(tab.adult.annot$pop_code), labels=lbls, main="colors = pop & pCO2")
ibs.adult <- snpgdsIBS(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 53
## # of SNPs: 457
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 37378
## Wed Apr 21 11:15:14 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:15:14 2021 Done.
pop.idx.adult <- order(tab.adult.annot$pop_code)
#par(xpd=TRUE)
#heatmap(ibs.adult$ibs, col=terrain.colors(16), labCol=TRUE,
# RowSideColors=c("black","red","green","blue")[tab.adult.annot$pop_code])
#legend(0,-.05, legend=levels(tab.adult.annot$pop_code), pch="o", col=1:nlevels(tab.adult.annot$pop_code),
# text.col=1:nlevels(tab.adult.annot$pop_code), cex=.75, ncol=4)
#Perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.adult <- cmdscale(1 - ibs.adult$ibs, k = 2)
x.adult <- loc.adult[, 1]; y.adult <- loc.adult[, 2]
plot(x.adult, y.adult, col=group.adult, xlab = "", ylab = "",
main = "Multidimensional Scaling Analysis (IBS)", cex=2, pch=16)
text(x.adult, y.adult+.01, labels=tab.adult.annot$sample.id, cex=0.7, font=2, col=as.integer(group.adult))
legend("bottom", legend=levels(group.adult), pch="o", text.col=1:nlevels(group.adult))
/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 11:15:15.855 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 21, 2021 11:15:16 AM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 11:15:16.314 INFO VariantFiltration - ------------------------------------------------------------
## 11:15:16.315 INFO VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 11:15:16.315 INFO VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 11:15:16.315 INFO VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 11:15:16.315 INFO VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 11:15:16.315 INFO VariantFiltration - Start Date/Time: April 21, 2021 at 11:15:15 AM PDT
## 11:15:16.315 INFO VariantFiltration - ------------------------------------------------------------
## 11:15:16.315 INFO VariantFiltration - ------------------------------------------------------------
## 11:15:16.315 INFO VariantFiltration - HTSJDK Version: 2.23.0
## 11:15:16.316 INFO VariantFiltration - Picard Version: 2.23.3
## 11:15:16.316 INFO VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 11:15:16.316 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 11:15:16.316 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 11:15:16.316 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 11:15:16.316 INFO VariantFiltration - Deflater: IntelDeflater
## 11:15:16.316 INFO VariantFiltration - Inflater: IntelInflater
## 11:15:16.316 INFO VariantFiltration - GCS max retries/reopens: 20
## 11:15:16.316 INFO VariantFiltration - Requester pays: disabled
## 11:15:16.316 INFO VariantFiltration - Initializing engine
## 11:15:16.455 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz
## 11:15:16.546 INFO VariantFiltration - Done initializing engine
## 11:15:16.615 INFO ProgressMeter - Starting traversal
## 11:15:16.616 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 11:15:26.636 INFO ProgressMeter - Contig59084_64831:24273576 0.2 456000 2730811.5
## 11:15:28.482 INFO ProgressMeter - Contig98527_104152:30619157 0.2 555097 2806827.9
## 11:15:28.482 INFO ProgressMeter - Traversal complete. Processed 555097 total variants in 0.2 minutes.
## 11:15:28.544 INFO VariantFiltration - Shutting down engine
## [April 21, 2021 at 11:15:28 AM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.21 minutes.
## Runtime.totalMemory()=387973120
/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## 11:15:30.239 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 21, 2021 11:15:30 AM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 11:15:30.403 INFO SelectVariants - ------------------------------------------------------------
## 11:15:30.403 INFO SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 11:15:30.403 INFO SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 11:15:30.403 INFO SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 11:15:30.403 INFO SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 11:15:30.404 INFO SelectVariants - Start Date/Time: April 21, 2021 at 11:15:30 AM PDT
## 11:15:30.404 INFO SelectVariants - ------------------------------------------------------------
## 11:15:30.404 INFO SelectVariants - ------------------------------------------------------------
## 11:15:30.404 INFO SelectVariants - HTSJDK Version: 2.23.0
## 11:15:30.404 INFO SelectVariants - Picard Version: 2.23.3
## 11:15:30.404 INFO SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 11:15:30.404 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 11:15:30.404 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 11:15:30.404 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 11:15:30.404 INFO SelectVariants - Deflater: IntelDeflater
## 11:15:30.404 INFO SelectVariants - Inflater: IntelInflater
## 11:15:30.404 INFO SelectVariants - GCS max retries/reopens: 20
## 11:15:30.404 INFO SelectVariants - Requester pays: disabled
## 11:15:30.404 INFO SelectVariants - Initializing engine
## 11:15:30.526 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz
## 11:15:30.614 INFO SelectVariants - Done initializing engine
## 11:15:30.644 INFO ProgressMeter - Starting traversal
## 11:15:30.644 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 11:15:40.685 INFO ProgressMeter - Contig143780_150710:12365183 0.2 124000 741035.9
## 11:15:50.731 INFO ProgressMeter - Contig41068_47193:11020689 0.3 275000 821426.8
## 11:16:00.783 INFO ProgressMeter - Contig92931_98526:20188607 0.5 430000 856033.7
## 11:16:01.420 INFO ProgressMeter - Contig98527_104152:27381330 0.5 439968 857748.9
## 11:16:01.420 INFO ProgressMeter - Traversal complete. Processed 439968 total variants in 0.5 minutes.
## 11:16:01.470 INFO SelectVariants - Shutting down engine
## [April 21, 2021 at 11:16:01 AM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.52 minutes.
## Runtime.totalMemory()=322961408
#snpgdsClose(genofile.larvae)
vcf.fn.larvae <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.larvae, "../qc-processing/gatk/genotypes-larvae.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 76
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz" ...
## import 128222 variants.
## + genotype { Bit2 76x128222, 2.3M } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../qc-processing/gatk/genotypes-larvae.gds' (2.9M)
## # of fragments: 59
## save to '../qc-processing/gatk/genotypes-larvae.gds.tmp'
## rename '../qc-processing/gatk/genotypes-larvae.gds.tmp' (2.9M, reduced: 468B)
## # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-larvae.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-larvae.gds
## The total number of samples: 76
## The total number of SNPs: 128222
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
(genofile.larvae <- snpgdsOpen("../qc-processing/gatk/genotypes-larvae.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-larvae.gds (2.9M)
## + [ ] *
## |--+ sample.id { Str8 76 LZMA_ra(63.8%), 201B }
## |--+ snp.id { Int32 128222 LZMA_ra(6.56%), 32.8K }
## |--+ snp.rs.id { Str8 128222 LZMA_ra(0.13%), 173B }
## |--+ snp.position { Int32 128222 LZMA_ra(42.0%), 210.2K }
## |--+ snp.chromosome { Str8 128222 LZMA_ra(0.04%), 857B }
## |--+ snp.allele { Str8 128222 LZMA_ra(12.0%), 60.2K }
## |--+ genotype { Bit2 76x128222, 2.3M } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 128222 LZMA_ra(65.7%), 328.9K }
## \--+ filter { Str8 128222 LZMA_ra(0.04%), 253B }
snpset.larvae <- snpgdsLDpruning(genofile.larvae, maf=.10, missing.rate=0.10, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 128,154 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0.1)
## # of samples: 76
## # of SNPs: 68
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.09%, 14/15,947
## Chromosome Contig104153_109739: 0.06%, 1/1,625
## Chromosome Contig11179_21262: 0.02%, 3/12,305
## Chromosome Contig116188_123130: 0.08%, 1/1,207
## Chromosome Contig129983_136896: 0.08%, 1/1,222
## Chromosome Contig136897_143777: 0.15%, 2/1,321
## Chromosome Contig143780_150710: 0.17%, 2/1,179
## Chromosome Contig181898_197428: 0.09%, 1/1,122
## Chromosome Contig21263_28135: 0.04%, 5/13,677
## Chromosome Contig28136_34745: 0.03%, 4/12,701
## Chromosome Contig34748_41067: 0.06%, 6/10,383
## Chromosome Contig398123_696856: 0.10%, 1/992
## Chromosome Contig41068_47193: 0.09%, 7/7,913
## Chromosome Contig47194_53180: 0.04%, 3/6,968
## Chromosome Contig5314_11178: 0.03%, 1/3,691
## Chromosome Contig53181_59083: 0.05%, 3/5,690
## Chromosome Contig59084_64831: 0.09%, 4/4,432
## Chromosome Contig81766_87352: 0.11%, 3/2,843
## Chromosome Contig87353_92929: 0.05%, 1/1,952
## Chromosome Contig92931_98526: 0.05%, 1/1,900
## Chromosome Contig98527_104152: 0.05%, 1/1,979
## 65 markers are selected in total.
snpset.larvae.id <- unlist(unname(snpset.larvae))
# PCA
pca.larvae <- snpgdsPCA(genofile.larvae, snp.id=snpset.larvae.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 76
## # of SNPs: 65
## using 2 threads
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 7538
## CPU capabilities: Double-Precision SSE2
## Wed Apr 21 11:16:07 2021 (internal increment: 13312)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:16:07 2021 Begin (eigenvalues and eigenvectors)
## Wed Apr 21 11:16:07 2021 Done.
pc.percent.larvae <- pca.larvae$varprop*100
head(round(pc.percent.larvae, 2))
## [1] 9.98 7.72 5.62 5.11 4.57 4.19
tab.larvae <- data.frame(sample.id = pca.larvae$sample.id,
EV1 = pca.larvae$eigenvect[,1], # the first eigenvector
EV2 = pca.larvae$eigenvect[,2], # the second eigenvector
EV3 = pca.larvae$eigenvect[,3], # the third eigenvector
EV4 = pca.larvae$eigenvect[,4], # the fourth eigenvector
EV5 = pca.larvae$eigenvect[,5], # the fourth eigenvector
stringsAsFactors = FALSE)
tab.larvae.annot <- left_join(tab.larvae, sample.info.larvae, by=c("sample.id"="sample")) %>% droplevels()
group.larvae <- as.factor(tab.larvae.annot$pop_code)
lbls <- paste("PC", 1:5, "\n", format(pc.percent.larvae[1:5], digits=2), "%", sep="")
pairs(pca.larvae$eigenvect[,1:5], col=as.color(tab.larvae.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")
pairs(pca.larvae$eigenvect[,1:5], col=as.color(tab.larvae.annot$pop_code), labels=lbls, main="colors = pop & pCO2")
Interactive PCA biplots
# PC1 x PC2
# uncomment out the ggplotly lines to interact
#ggplotly(
ggplot(tab.larvae.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC2\nLarvae Genetic Structure (pooled batches)")#)
#PC1 x PC3
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.larvae.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC3\nLarvae Genetic Structure (pooled batches)"))
# PC1 x PC4
ggplotly(
ggplot(tab.larvae.annot, aes(x=EV4, y=EV2, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC4\nLarvae Gene Expression (pooled batches)"))
# PC1 x PC5
ggplotly(
ggplot(tab.larvae.annot, aes(x=EV5, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC5\nLarvae Gene Expression (pooled batches)"))
save(tab.larvae.annot, file="../results/tab.larvae.annot")
# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances,
# and determine the groups by a permutation score.
ibs.hc.larvae <- snpgdsHCluster(snpgdsIBS(genofile.larvae, num.thread=2, snp.id=snpset.larvae.id, autosome.only=FALSE,
sample.id=setdiff(tab.larvae.annot$sample.id, "044")))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 75
## # of SNPs: 65
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 7516
## Wed Apr 21 11:16:08 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:16:08 2021 Done.
rv.larvae <- snpgdsCutTree(ibs.hc.larvae)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.larvae$dendrogram, leaflab="none", main="HapMap Phase II")
# Determine groups of individuals by population information
group.larvae <- as.factor(tab.larvae.annot$pop_code)
rv2.larvae <- snpgdsCutTree(ibs.hc.larvae, samp.group=as.factor(subset(tab.larvae.annot, sample.id!="044")$pop_code))
## Create 8 groups.
plot(rv2.larvae$dendrogram, leaflab="none", main="HapMap Phase II")
legend("top", legend=levels(group.larvae), col=1:nlevels(group.larvae), pch=19, ncol=4, cex=0.6, pt.cex=1)
## Larval Allele frequencies
AF.larvae.FB.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.FB.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae.DB.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.DB.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae.OB1.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.OB1.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae.OB2.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C2" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.OB2.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C2" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae <- rbind(do.call(cbind.data.frame, AF.larvae.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.OB1.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.OB1.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.OB2.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB2"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.OB2.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB2"), stage=as.factor("larvae"), pCO2=as.factor("ambient")))
ggplot(AF.larvae, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population, ncol=4) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
ggtitle(paste("SNP Allele Frequency, larvaes by population & pCO2 treatment\n", "(No. loci = ", length(snpset.larvae.id), ")", sep="")) +
geom_text(data=AF.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2))) + theme(legend.position = "none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
# within FB (comparing pCO2 exposure):
Fst.larvae.FB <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay")$sample.id,
population=as.factor(subset(tab.larvae.annot, population=="Fidalgo Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 7 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 17
## # of SNPs: 58
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (7), High (10)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.larvae.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: 0.00852"
hist(Fst.larvae.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, Larvae", col="dodgerblue3", xlim = c(-.15, 0.8))
# within DB (comparing pCO2 exposure):
Fst.larvae.DB <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Dabob Bay")$sample.id,
population=as.factor(subset(tab.larvae.annot, population=="Dabob Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 18 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 12
## # of SNPs: 47
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (5), High (7)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.larvae.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: 0.01232"
hist(Fst.larvae.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, Larvae", col="darkseagreen3", xlim = c(-.2, .8))
# within OB1 (comparing pCO2 exposure):
Fst.larvae.OB1 <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1")$sample.id,
population=as.factor(subset(tab.larvae.annot, population=="Oyster Bay C1")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 2 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 32
## # of SNPs: 63
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (16), High (16)
paste("Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: ", round(Fst.larvae.OB1$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: 0.00046"
hist(Fst.larvae.OB1$FstSNP, breaks = 100, main="Fst distribution\nOyster Bay C1 Ambient vs. High, Larvae", col="salmon2", xlim = c(-.2, 0.8))
Fst.larvae <- rbind(do.call(cbind.data.frame, Fst.larvae.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae")),
do.call(cbind.data.frame, Fst.larvae.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae")),
do.call(cbind.data.frame, Fst.larvae.OB1[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae")))
ggplot(Fst.larvae, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) +
stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
ggtitle(paste("SNP Fst, Larvae by pCO2 treatment\n", "(No. loci = ", length(snpset.larvae.id), ")", sep="")) +
geom_text(data=Fst.larvae %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=.78, label=round(mean, 3))) +
theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.
Larval Identify-by-state
ibs.larvae <- snpgdsIBS(genofile.larvae, num.thread=2, snp.id=snpset.larvae.id, autosome.only=FALSE, sample.id=setdiff(tab.larvae.annot$sample.id, "044"))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 75
## # of SNPs: 65
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 7516
## Wed Apr 21 11:16:10 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:16:10 2021 Done.
pop.idx.larvae <- order(tab.larvae.annot$pop_code)
#perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.larvae <- cmdscale(1 - ibs.larvae$ibs, k = 2)
x.larvae <- loc.larvae[, 1]; y.larvae <- loc.larvae[, 2]
a <- as.factor(subset(tab.larvae.annot, sample.id!="044")$population)
b <- as.numeric(as.factor(subset(tab.larvae.annot, sample.id!="044")$pCO2.parent))
par(xpd=TRUE)
plot(x.larvae, y.larvae, col=group.larvae, xlab = "", ylab = "",
main = "Multidimensional Scaling Analysis (IBS)", cex=2, pch=16)
#text(x.larvae, y.larvae+.01, labels=tab.larvae.annot$sample.id, cex=0.7, font=2, col=as.integer(group.larvae))
legend("bottomright", inset=c(-.065,0), legend=levels(group.larvae), cex=.6, pch="o", text.col=1:nlevels(group.larvae))
# Option to plot color coded by pop, symbol by pCO2
#plot(x.larvae, y.larvae, col=a, pch=b, xlab = "", ylab = "",
# main = "Multidimensional Scaling Analysis (IBS)", cex=1)
/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 11:16:12.229 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 21, 2021 11:16:12 AM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 11:16:12.410 INFO VariantFiltration - ------------------------------------------------------------
## 11:16:12.410 INFO VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 11:16:12.410 INFO VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 11:16:12.410 INFO VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 11:16:12.410 INFO VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 11:16:12.410 INFO VariantFiltration - Start Date/Time: April 21, 2021 at 11:16:12 AM PDT
## 11:16:12.410 INFO VariantFiltration - ------------------------------------------------------------
## 11:16:12.410 INFO VariantFiltration - ------------------------------------------------------------
## 11:16:12.411 INFO VariantFiltration - HTSJDK Version: 2.23.0
## 11:16:12.411 INFO VariantFiltration - Picard Version: 2.23.3
## 11:16:12.411 INFO VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 11:16:12.411 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 11:16:12.411 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 11:16:12.411 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 11:16:12.411 INFO VariantFiltration - Deflater: IntelDeflater
## 11:16:12.411 INFO VariantFiltration - Inflater: IntelInflater
## 11:16:12.411 INFO VariantFiltration - GCS max retries/reopens: 20
## 11:16:12.411 INFO VariantFiltration - Requester pays: disabled
## 11:16:12.411 INFO VariantFiltration - Initializing engine
## 11:16:12.538 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz
## 11:16:12.609 INFO VariantFiltration - Done initializing engine
## 11:16:12.664 INFO ProgressMeter - Starting traversal
## 11:16:12.665 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 11:16:16.927 INFO ProgressMeter - Contig98527_104152:26467437 0.1 280822 3953383.4
## 11:16:16.928 INFO ProgressMeter - Traversal complete. Processed 280822 total variants in 0.1 minutes.
## 11:16:16.978 INFO VariantFiltration - Shutting down engine
## [April 21, 2021 at 11:16:16 AM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.08 minutes.
## Runtime.totalMemory()=387973120
/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## 11:16:18.636 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 21, 2021 11:16:18 AM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 11:16:18.843 INFO SelectVariants - ------------------------------------------------------------
## 11:16:18.843 INFO SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 11:16:18.843 INFO SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 11:16:18.843 INFO SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 11:16:18.843 INFO SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 11:16:18.843 INFO SelectVariants - Start Date/Time: April 21, 2021 at 11:16:18 AM PDT
## 11:16:18.843 INFO SelectVariants - ------------------------------------------------------------
## 11:16:18.843 INFO SelectVariants - ------------------------------------------------------------
## 11:16:18.844 INFO SelectVariants - HTSJDK Version: 2.23.0
## 11:16:18.844 INFO SelectVariants - Picard Version: 2.23.3
## 11:16:18.844 INFO SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 11:16:18.844 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 11:16:18.844 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 11:16:18.844 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 11:16:18.844 INFO SelectVariants - Deflater: IntelDeflater
## 11:16:18.844 INFO SelectVariants - Inflater: IntelInflater
## 11:16:18.844 INFO SelectVariants - GCS max retries/reopens: 20
## 11:16:18.844 INFO SelectVariants - Requester pays: disabled
## 11:16:18.845 INFO SelectVariants - Initializing engine
## 11:16:18.962 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz
## 11:16:19.031 INFO SelectVariants - Done initializing engine
## 11:16:19.064 INFO ProgressMeter - Starting traversal
## 11:16:19.064 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 11:16:25.581 INFO ProgressMeter - Contig98527_104152:24952552 0.1 222831 2051850.8
## 11:16:25.581 INFO ProgressMeter - Traversal complete. Processed 222831 total variants in 0.1 minutes.
## 11:16:25.610 INFO SelectVariants - Shutting down engine
## [April 21, 2021 at 11:16:25 AM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.12 minutes.
## Runtime.totalMemory()=387973120
#snpgdsClose(genofile.juvenile)
vcf.fn.juvenile <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.juvenile, "../qc-processing/gatk/genotypes-juvenile.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 16
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz" ...
## import 50608 variants.
## + genotype { Bit2 16x50608, 197.7K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../qc-processing/gatk/genotypes-juvenile.gds' (459.9K)
## # of fragments: 49
## save to '../qc-processing/gatk/genotypes-juvenile.gds.tmp'
## rename '../qc-processing/gatk/genotypes-juvenile.gds.tmp' (459.5K, reduced: 348B)
## # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-juvenile.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-juvenile.gds
## The total number of samples: 16
## The total number of SNPs: 50608
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
(genofile.juvenile <- snpgdsOpen("../qc-processing/gatk/genotypes-juvenile.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-juvenile.gds (459.5K)
## + [ ] *
## |--+ sample.id { Str8 16 LZMA_ra(178.1%), 121B }
## |--+ snp.id { Int32 50608 LZMA_ra(7.08%), 14.0K }
## |--+ snp.rs.id { Str8 50608 LZMA_ra(0.30%), 157B }
## |--+ snp.position { Int32 50608 LZMA_ra(43.1%), 85.1K }
## |--+ snp.chromosome { Str8 50608 LZMA_ra(0.07%), 649B }
## |--+ snp.allele { Str8 50608 LZMA_ra(12.3%), 24.2K }
## |--+ genotype { Bit2 16x50608, 197.7K } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 50608 LZMA_ra(68.5%), 135.4K }
## \--+ filter { Str8 50608 LZMA_ra(0.08%), 197B }
snpset.juvenile <- snpgdsLDpruning(genofile.juvenile, maf=.1, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 41,649 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0)
## # of samples: 16
## # of SNPs: 8,959
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 4.89%, 280/5,731
## Chromosome Contig104153_109739: 5.60%, 42/750
## Chromosome Contig109741_116186: 5.22%, 35/670
## Chromosome Contig11179_21262: 4.25%, 191/4,498
## Chromosome Contig116188_123130: 6.38%, 37/580
## Chromosome Contig123131_129982: 5.21%, 22/422
## Chromosome Contig129983_136896: 6.16%, 33/536
## Chromosome Contig136897_143777: 5.40%, 36/667
## Chromosome Contig143780_150710: 6.58%, 32/486
## Chromosome Contig150711_166372: 6.15%, 28/455
## Chromosome Contig166374_181896: 6.55%, 36/550
## Chromosome Contig181898_197428: 4.96%, 21/423
## Chromosome Contig197431_213218: 5.14%, 29/564
## Chromosome Contig21263_28135: 4.63%, 231/4,994
## Chromosome Contig213221_398012: 6.86%, 35/510
## Chromosome Contig28136_34745: 4.34%, 207/4,772
## Chromosome Contig34748_41067: 5.19%, 198/3,817
## Chromosome Contig398123_696856: 6.09%, 29/476
## Chromosome Contig41068_47193: 4.94%, 158/3,201
## Chromosome Contig47194_53180: 5.14%, 145/2,819
## Chromosome Contig5314_11178: 5.02%, 71/1,414
## Chromosome Contig53181_59083: 5.21%, 132/2,534
## Chromosome Contig59084_64831: 5.36%, 96/1,792
## Chromosome Contig64832_70524: 5.38%, 89/1,654
## Chromosome Contig70525_76135: 5.71%, 76/1,331
## Chromosome Contig76136_81765: 5.94%, 79/1,329
## Chromosome Contig81766_87352: 5.90%, 71/1,203
## Chromosome Contig87353_92929: 5.89%, 51/866
## Chromosome Contig92931_98526: 5.94%, 47/791
## Chromosome Contig98527_104152: 6.60%, 51/773
## 2,588 markers are selected in total.
snpset.juvenile.id <- unlist(unname(snpset.juvenile))
# PCA
pca.juvenile <- snpgdsPCA(genofile.juvenile, snp.id=snpset.juvenile.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 16
## # of SNPs: 2,588
## using 2 threads
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 56968
## CPU capabilities: Double-Precision SSE2
## Wed Apr 21 11:16:27 2021 (internal increment: 63232)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:16:27 2021 Begin (eigenvalues and eigenvectors)
## Wed Apr 21 11:16:27 2021 Done.
pc.percent.juvenile <- pca.juvenile$varprop*100
head(round(pc.percent.juvenile, 2))
## [1] 19.31 12.93 8.90 7.30 6.98 6.30
tab.juvenile <- data.frame(sample.id = pca.juvenile$sample.id,
EV1 = pca.juvenile$eigenvect[,1], # the first eigenvector
EV2 = pca.juvenile$eigenvect[,2], # the second eigenvector
EV3 = pca.juvenile$eigenvect[,3], # the third eigenvector
EV4 = pca.juvenile$eigenvect[,4], # the fourth eigenvector
EV5 = pca.juvenile$eigenvect[,5], # the fourth eigenvector
stringsAsFactors = FALSE)
tab.juvenile.annot <- left_join(tab.juvenile, sample.info.juvenile, by=c("sample.id"="sample")) %>% droplevels()
group.juvenile <- as.factor(tab.juvenile.annot$pop_code)
lbls <- paste("PC", 1:5, "\n", format(pc.percent.juvenile[1:5], digits=2), "%", sep="")
pairs(pca.juvenile$eigenvect[,1:5], col=as.color(tab.juvenile.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")
pairs(pca.juvenile$eigenvect[,1:5], col=as.color(tab.juvenile.annot$pop_code), labels=lbls, main="colors = pop & pCO2")
Interactive PCAs
# PC1 x PC2
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.juvenile.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC2\njuvenile Genetic Structure (pooled batches)"))
#PC1 x PC3
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.juvenile.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC3\njuvenile Genetic Structure (pooled batches)"))
#PC2 x PC3
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.juvenile.annot, aes(x=EV3, y=EV2, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC2 x PC3\njuvenile Genetic Structure (pooled batches)"))
save(tab.juvenile.annot, file="../results/tab.juvenile.annot")
# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances,
# and determine the groups by a permutation score.
ibs.hc.juvenile <- snpgdsHCluster(snpgdsIBS(genofile.juvenile, num.thread=2, snp.id=snpset.juvenile.id, autosome.only=FALSE,
sample.id=setdiff(tab.juvenile.annot$sample.id, "044")))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 16
## # of SNPs: 2,588
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 56968
## Wed Apr 21 11:16:27 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:16:27 2021 Done.
rv.juvenile <- snpgdsCutTree(ibs.hc.juvenile)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.juvenile$dendrogram, leaflab="none", main="HapMap Phase II")
# Determine groups of individuals by population information
rv2.juvenile <- snpgdsCutTree(ibs.hc.juvenile, samp.group=as.factor(subset(tab.juvenile.annot, sample.id!="044")$pop_code))
## Create 4 groups.
plot(rv2.juvenile$dendrogram, leaflab="none", main="HapMap Phase II")
legend("top", legend=levels(group.juvenile), col=1:nlevels(group.juvenile), pch=19, ncol=4, cex=0.6, pt.cex=1)
## Juvenile Allele frequencies
AF.juvenile.FB.high <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.juvenile.FB.amb <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.juvenile.DB.high <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.juvenile.DB.amb <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.juveniles <- rbind(do.call(cbind.data.frame, AF.juvenile.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.juvenile.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.juvenile.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.juvenile.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"), pCO2=as.factor("ambient")))
ggplot(AF.juveniles, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
ggtitle(paste("SNP Allele Frequency, juveniles by population & pCO2 treatment\n", "(No. loci = ", length(snpset.juvenile.id), ")", sep="")) +
geom_text(data=AF.juveniles %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2)))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
# within FB (comparing pCO2 exposure):
Fst.juvenile.FB <- snpgdsFst(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay")$sample.id,
population=as.factor(subset(tab.juvenile.annot, population=="Fidalgo Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 261 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 8
## # of SNPs: 2,327
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (4), High (4)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.juvenile.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: 0.03554"
hist(Fst.juvenile.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, juvenile", col="dodgerblue3", xlim = c(-.15, 0.8))
# within DB (comparing pCO2 exposure):
Fst.juvenile.DB <- snpgdsFst(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Dabob Bay")$sample.id,
population=as.factor(subset(tab.juvenile.annot, population=="Dabob Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 270 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 8
## # of SNPs: 2,318
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (4), High (4)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.juvenile.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: 0.10518"
hist(Fst.juvenile.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, juvenile", col="darkseagreen3", xlim = c(-.2, .8))
Fst.juvenile <- rbind(do.call(cbind.data.frame, Fst.juvenile.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile")),
do.call(cbind.data.frame, Fst.juvenile.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile")))
ggplot(Fst.juvenile, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) +
stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
ggtitle(paste("SNP Fst, juvenile by pCO2 treatment\n", "(No. loci = ", length(snpset.juvenile.id), ")", sep="")) +
geom_text(data=Fst.juvenile %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=1.1, label=round(mean, 3))) +
theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.
ibs.juvenile <- snpgdsIBS(genofile.juvenile, num.thread=2, snp.id=snpset.juvenile.id, autosome.only=FALSE, sample.id=setdiff(tab.juvenile.annot$sample.id, "044"))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 16
## # of SNPs: 2,588
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 56968
## Wed Apr 21 11:16:28 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Wed Apr 21 11:16:28 2021 Done.
pop.idx.juvenile <- order(tab.juvenile.annot$pop_code)
#perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.juvenile <- cmdscale(1 - ibs.juvenile$ibs, k = 2)
x.juvenile <- loc.juvenile[, 1]; y.juvenile <- loc.juvenile[, 2]
a <- as.factor(subset(tab.juvenile.annot, sample.id!="044")$population)
b <- as.numeric(as.factor(subset(tab.juvenile.annot, sample.id!="044")$pCO2.parent))
plot(x.juvenile, y.juvenile, col=a, pch=b, xlab = "", ylab = "",
main = "Multidimensional Scaling Analysis (IBS)", cex=1)
#text(x.juvenile, y.juvenile+.012, labels=tab.juvenile.annot$sample.id, cex=0.7, font=2, col=as.integer(group.juvenile))
#legend("topleft", legend=levels(group.juvenile), pch="o", cex=0.8) #text.col=1:nlevels(group.juvenile),
Fst.all <- rbind(Fst.adults, Fst.larvae, Fst.juvenile)
Fst.means <- Fst.all %>% select(-snp.id) %>% group_by(population, stage) %>% summarize(mean=mean(FstSNP), sd=sd(FstSNP), n=n())
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
AF.all <- rbind(AF.adults, AF.larvae, AF.juveniles)
AF.means <- AF.all %>% select(-snp.id) %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(AlleleFreq), sd=sd(AlleleFreq), n=n())
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
ggplot(AF.all, aes(x=stage, y=AlleleFreq, fill=pCO2, alpha=stage)) + geom_violin(trim = T) + theme_minimal() +
ggtitle("Allele Frequency\nBy population, stage, & adult pCO2 treatment") + xlab("") + ylab("Allele Frequency") + facet_wrap(~population) +scale_alpha_discrete(range = c(0.85, 0.15)) #+
## Warning: Using alpha for a discrete variable is not advised.
# g0eom_point(data=AF.all %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(AlleleFreq)),
# aes(x=stage:pCO2, y=mean, label=round(mean, 3)), cex=2.25, inherit.aes=T, position = position_dodge(width=1))
ggplot(Fst.all, aes(x=stage, y=FstSNP, fill=stage)) + geom_violin(trim = T, alpha=0.4) + theme_minimal() + facet_wrap(~ population) +
stat_summary(fun.y=mean, geom="point", aes(group=stage), shape=8, size=2, color="black", fill="black") +
ggtitle("Per-SNP Fst among pCO2 treatments\nBy population and stage") + xlab("") + ylab("Per-Locus Fst") +
geom_text(data=Fst.means, aes(group=stage, y=-.33, label=round(mean, 3)), cex=3.25)
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(Fst.all %>% filter(population==c("FB","DB")), aes(x=FstSNP, fill=stage)) + geom_density(alpha=0.4) + theme_minimal() +
ggtitle("Per-SNP Fst among adult pCO2 treatments (high vs. ambient)\nBy population & stage") + xlab("") + ylab("Per-Locus Fst") + facet_wrap(~population)
# More analysis via vcfR - compare allelic richness and heterozygosity by population, stage and parental pCO2 exposure
First filter the vcf files for each stage (adult, larvae, adult) to remove loci with tons if missing data. Do this via vcftools.
I used this tutorial as an example. - call vcftools - feed it a vcf file after the –vcf flag - Set –max-missing 0.90 to keep only loci with 10% missing genotypes across all individuals - The –recode flag tells the program to write a new vcf file with the filters - The –recode-INFO-all keeps all the INFO flags from the old vcf file in the new one. - Lastly, –out designates the name of the output
# adults
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR"
##
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
##
## Parameters as interpreted:
## --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## --recode-INFO-all
## --max-missing 0.85
## --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR
## --recode
##
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 53 out of 53 Individuals
## Outputting VCF file...
## After filtering, kept 45063 out of a possible 179693 Sites
## Run Time = 8.00 seconds
# larvae
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR"
##
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
##
## Parameters as interpreted:
## --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## --recode-INFO-all
## --max-missing 0.85
## --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR
## --recode
##
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 76 out of 76 Individuals
## Outputting VCF file...
## After filtering, kept 6029 out of a possible 131140 Sites
## Run Time = 4.00 seconds
# juveniles
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR"
##
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
##
## Parameters as interpreted:
## --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## --recode-INFO-all
## --max-missing 0.85
## --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR
## --recode
##
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 16 out of 16 Individuals
## Outputting VCF file...
## After filtering, kept 33681 out of a possible 51818 Sites
## Run Time = 2.00 seconds
require(vcfR)
# Load the data
vcf.adult <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR.recode.vcf", verbose = FALSE )
# Confirm order of samples in vcf matches order of samples in the "sample.info" dataframe
colnames(extract.gt(vcf.adult)) == sample.info.adult$sample #yes, all are TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups
myDiff.adult <- genetic_diff(vcf.adult, pops = sample.info.adult$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, adults ", nrow(myDiff.adult), sep="")
## [1] "No. SNPs included in this vcfR analysis, adults 45063"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.adult[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient | 0.128 |
| Hs_Dabob Bay-High | 0.129 |
| Hs_Fidalgo Bay-Ambient | 0.123 |
| Hs_Fidalgo Bay-High | 0.121 |
| Hs_Oyster Bay C1-Ambient | 0.130 |
| Hs_Oyster Bay C1-High | 0.123 |
| Ht | 0.141 |
| n_Dabob Bay-Ambient | 17.007 |
| n_Dabob Bay-High | 14.722 |
| n_Fidalgo Bay-Ambient | 16.442 |
| n_Fidalgo Bay-High | 16.319 |
| n_Oyster Bay C1-Ambient | 17.279 |
| n_Oyster Bay C1-High | 16.834 |
| Gst | 0.087 |
| Htmax | 0.853 |
| Gstmax | 0.857 |
| Gprimest | 0.109 |
# Violin plots of Heterozygosity
dpf.adult <- melt(myDiff.adult[,c(3:8)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.adult, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults \nNo. SNPs =", nrow(myDiff.adult))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.adult %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Boxplots of allelic richness
dpf.n.adult <- melt(myDiff.adult[,c(10:15)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.adult, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Adults \nNo. SNPs =", nrow(myDiff.adult))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.adult %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=18.5, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Load the data
vcf.larvae <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR.recode.vcf", verbose = FALSE )
# Write script to order sample.info dataframe to have same order as vcf
colnames(extract.gt(vcf.larvae)) == sample.info.larvae[match(colnames(extract.gt(vcf.larvae)), sample.info.larvae$sample),]$sample #yes, all are TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups
myDiff.larvae <- genetic_diff(vcf.larvae, pops = sample.info.larvae[match(colnames(extract.gt(vcf.larvae)), sample.info.larvae$sample),]$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, juveniles ", nrow(myDiff.larvae), sep="")
## [1] "No. SNPs included in this vcfR analysis, juveniles 6029"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.larvae[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient | 0.081 |
| Hs_Dabob Bay-High | 0.082 |
| Hs_Fidalgo Bay-Ambient | 0.084 |
| Hs_Fidalgo Bay-High | 0.090 |
| Hs_Oyster Bay C1-Ambient | 0.068 |
| Hs_Oyster Bay C1-High | 0.069 |
| Hs_Oyster Bay C2-Ambient | 0.068 |
| Hs_Oyster Bay C2-High | 0.063 |
| Ht | 0.081 |
| n_Dabob Bay-Ambient | 9.268 |
| n_Dabob Bay-High | 9.608 |
| n_Fidalgo Bay-Ambient | 11.806 |
| n_Fidalgo Bay-High | 19.365 |
| n_Oyster Bay C1-Ambient | 27.795 |
| n_Oyster Bay C1-High | 28.190 |
| n_Oyster Bay C2-Ambient | 13.528 |
| n_Oyster Bay C2-High | 14.253 |
| Gst | 0.082 |
| Htmax | 0.862 |
| Gstmax | 0.916 |
| Gprimest | 0.092 |
# Violin plots of Heterozygosity
dpf.larvae <- melt(myDiff.larvae[,c(3:10)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.larvae, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Larvae \nNo. SNPs =", nrow(myDiff.larvae))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Boxplots of allelic richness
dpf.n.larvae <- melt(myDiff.larvae[,c(12:19)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.larvae, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Larvae \nNo. SNPs =", nrow(myDiff.larvae))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=3, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# 1-yr old “juveniles”
# Load the data
vcf.juvenile <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR.recode.vcf", verbose = FALSE )
# Confirm order of samples in vcf matches order of samples in the "sample.info" dataframe
colnames(extract.gt(vcf.juvenile)) == sample.info.juvenile$sample #yes, all are TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups
myDiff.juvenile <- genetic_diff(vcf.juvenile, pops = sample.info.juvenile$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, juveniles ", nrow(myDiff.juvenile), sep="")
## [1] "No. SNPs included in this vcfR analysis, juveniles 33681"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.juvenile[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient | 0.189 |
| Hs_Dabob Bay-High | 0.197 |
| Hs_Fidalgo Bay-Ambient | 0.198 |
| Hs_Fidalgo Bay-High | 0.195 |
| Ht | 0.246 |
| n_Dabob Bay-Ambient | 7.637 |
| n_Dabob Bay-High | 7.838 |
| n_Fidalgo Bay-Ambient | 7.741 |
| n_Fidalgo Bay-High | 7.425 |
| Gst | 0.185 |
| Htmax | 0.797 |
| Gstmax | 0.762 |
| Gprimest | 0.251 |
# Violin plots of Heterozygosity
dpf.juvenile <- melt(myDiff.juvenile[,c(3:6)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.juvenile, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Juveniles \nNo. SNPs =", nrow(myDiff.juvenile))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.juvenile %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Boxplots of allelic richness
dpf.n.juvenile <- melt(myDiff.juvenile[,c(8:11)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.juvenile, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Juveniles \nNo. SNPs =", nrow(myDiff.juvenile))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.juvenile %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=8.2, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
myDiff.AL <- inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) #7,202 loci common among all
paste("No. SNPs shared among adults and larvae: ", nrow(myDiff.AL), sep="")
## [1] "No. SNPs shared among adults and larvae: 1397"
# Compare adults vs. larvae
knitr::kable(round(colMeans(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS"))[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient.x | 0.135 |
| Hs_Dabob Bay-High.x | 0.118 |
| Hs_Fidalgo Bay-Ambient.x | 0.111 |
| Hs_Fidalgo Bay-High.x | 0.117 |
| Hs_Oyster Bay C1-Ambient.x | 0.115 |
| Hs_Oyster Bay C1-High.x | 0.106 |
| Ht.x | 0.132 |
| n_Dabob Bay-Ambient.x | 17.311 |
| n_Dabob Bay-High.x | 14.454 |
| n_Fidalgo Bay-Ambient.x | 16.500 |
| n_Fidalgo Bay-High.x | 16.839 |
| n_Oyster Bay C1-Ambient.x | 17.270 |
| n_Oyster Bay C1-High.x | 17.092 |
| Gst.x | 0.094 |
| Htmax.x | 0.851 |
| Gstmax.x | 0.866 |
| Gprimest.x | 0.114 |
| Hs_Dabob Bay-Ambient.y | 0.111 |
| Hs_Dabob Bay-High.y | 0.111 |
| Hs_Fidalgo Bay-Ambient.y | 0.110 |
| Hs_Fidalgo Bay-High.y | 0.111 |
| Hs_Oyster Bay C1-Ambient.y | 0.096 |
| Hs_Oyster Bay C1-High.y | 0.095 |
| Hs_Oyster Bay C2-Ambient | 0.090 |
| Hs_Oyster Bay C2-High | 0.088 |
| Ht.y | 0.109 |
| n_Dabob Bay-Ambient.y | 9.244 |
| n_Dabob Bay-High.y | 9.618 |
| n_Fidalgo Bay-Ambient.y | 11.810 |
| n_Fidalgo Bay-High.y | 19.373 |
| n_Oyster Bay C1-Ambient.y | 27.993 |
| n_Oyster Bay C1-High.y | 28.306 |
| n_Oyster Bay C2-Ambient | 13.489 |
| n_Oyster Bay C2-High | 14.109 |
| Gst.y | 0.090 |
| Htmax.y | 0.865 |
| Gstmax.y | 0.888 |
| Gprimest.y | 0.103 |
# Heterozygosity
dpf.AL <-
melt(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) [,c(3:8,20:27)],
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## Warning: `mutate_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2794 rows [16765,
## 16766, 16767, 16768, 16769, 16770, 16771, 16772, 16773, 16774, 16775, 16776,
## 16777, 16778, 16779, 16780, 16781, 16782, 16783, 16784, ...].
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
ggplot(subset(dpf.AL, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Larvae\nNo. shared SNPs =", nrow(myDiff.AL))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.AL %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "dodgerblue3")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_text).
# Allelic Richness
dpf.n.AL <- melt(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) [,c(10:15,29:36)],
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2794 rows [16765,
## 16766, 16767, 16768, 16769, 16770, 16771, 16772, 16773, 16774, 16775, 16776,
## 16777, 16778, 16779, 16780, 16781, 16782, 16783, 16784, ...].
ggplot(subset(dpf.n.AL, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), width=1) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1.3)) + ggtitle(paste("Allelic Richness, Adults & Larvae\nNo. shared SNPs =", nrow(myDiff.AL))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.AL %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=2, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "dodgerblue3")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_text).
myDiff.AJ <- inner_join(myDiff.adult, myDiff.juvenile, by=c("CHROM", "POS"))
paste("No. SNPs shared among adults and juveniles: ", nrow(myDiff.AJ), sep="")
## [1] "No. SNPs shared among adults and juveniles: 4352"
# Compare adults vs. larvae
knitr::kable(round(colMeans(myDiff.AJ[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient.x | 0.203 |
| Hs_Dabob Bay-High.x | 0.198 |
| Hs_Fidalgo Bay-Ambient.x | 0.192 |
| Hs_Fidalgo Bay-High.x | 0.200 |
| Hs_Oyster Bay C1-Ambient | 0.192 |
| Hs_Oyster Bay C1-High | 0.184 |
| Ht.x | 0.219 |
| n_Dabob Bay-Ambient.x | 17.248 |
| n_Dabob Bay-High.x | 14.235 |
| n_Fidalgo Bay-Ambient.x | 16.196 |
| n_Fidalgo Bay-High.x | 16.919 |
| n_Oyster Bay C1-Ambient | 17.188 |
| n_Oyster Bay C1-High | 17.011 |
| Gst.x | 0.099 |
| Htmax.x | 0.864 |
| Gstmax.x | 0.779 |
| Gprimest.x | 0.135 |
| Hs_Dabob Bay-Ambient.y | 0.182 |
| Hs_Dabob Bay-High.y | 0.190 |
| Hs_Fidalgo Bay-Ambient.y | 0.200 |
| Hs_Fidalgo Bay-High.y | 0.188 |
| Ht.y | 0.243 |
| n_Dabob Bay-Ambient.y | 7.515 |
| n_Dabob Bay-High.y | 7.822 |
| n_Fidalgo Bay-Ambient.y | 7.842 |
| n_Fidalgo Bay-High.y | 7.394 |
| Gst.y | 0.193 |
| Htmax.y | 0.795 |
| Gstmax.y | 0.767 |
| Gprimest.y | 0.259 |
# Heterozygosity
dpf.AJ <- melt(select(myDiff.AJ,contains("Hs")),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".juvenile", variable)) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 8704 rows [17409,
## 17410, 17411, 17412, 17413, 17414, 17415, 17416, 17417, 17418, 17419, 17420,
## 17421, 17422, 17423, 17424, 17425, 17426, 17427, 17428, ...].
ggplot(filter(dpf.AJ, !grepl('Oyster', population))%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Juveniles\nNo. shared SNPs =", nrow(myDiff.AJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=filter(dpf.AJ, !grepl('Oyster', population))%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "darkred")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Allelic Richness
dpf.n.AJ <- melt(select(myDiff.AJ,contains("n_")),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".juvenile", variable)) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 8704 rows [17409,
## 17410, 17411, 17412, 17413, 17414, 17415, 17416, 17417, 17418, 17419, 17420,
## 17421, 17422, 17423, 17424, 17425, 17426, 17427, 17428, ...].
ggplot(filter(dpf.n.AJ, !grepl('Oyster', population))%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Juveniles\nNo. shared SNPs =", nrow(myDiff.AJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=filter(dpf.n.AJ, !grepl('Oyster', population))%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "darkred")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
myDiff.ALJ <- inner_join(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")), myDiff.juvenile, by=c("CHROM", "POS"))
paste("No. SNPs shared among adults, larvae, and juveniles: ", nrow(myDiff.ALJ), sep="")
## [1] "No. SNPs shared among adults, larvae, and juveniles: 412"
# Compare adults vs. larvae
knitr::kable(round(colMeans(myDiff.ALJ[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient.x | 0.188 |
| Hs_Dabob Bay-High.x | 0.169 |
| Hs_Fidalgo Bay-Ambient.x | 0.142 |
| Hs_Fidalgo Bay-High.x | 0.150 |
| Hs_Oyster Bay C1-Ambient.x | 0.141 |
| Hs_Oyster Bay C1-High.x | 0.130 |
| Ht.x | 0.173 |
| n_Dabob Bay-Ambient.x | 17.398 |
| n_Dabob Bay-High.x | 14.296 |
| n_Fidalgo Bay-Ambient.x | 16.379 |
| n_Fidalgo Bay-High.x | 16.995 |
| n_Oyster Bay C1-Ambient.x | 17.194 |
| n_Oyster Bay C1-High.x | 17.121 |
| Gst.x | 0.102 |
| Htmax.x | 0.857 |
| Gstmax.x | 0.825 |
| Gprimest.x | 0.130 |
| Hs_Dabob Bay-Ambient.y | 0.150 |
| Hs_Dabob Bay-High.y | 0.154 |
| Hs_Fidalgo Bay-Ambient.y | 0.151 |
| Hs_Fidalgo Bay-High.y | 0.166 |
| Hs_Oyster Bay C1-Ambient.y | 0.117 |
| Hs_Oyster Bay C1-High.y | 0.115 |
| Hs_Oyster Bay C2-Ambient | 0.114 |
| Hs_Oyster Bay C2-High | 0.110 |
| Ht.y | 0.143 |
| n_Dabob Bay-Ambient.y | 9.194 |
| n_Dabob Bay-High.y | 9.476 |
| n_Fidalgo Bay-Ambient.y | 12.010 |
| n_Fidalgo Bay-High.y | 19.427 |
| n_Oyster Bay C1-Ambient.y | 27.607 |
| n_Oyster Bay C1-High.y | 28.364 |
| n_Oyster Bay C2-Ambient | 13.495 |
| n_Oyster Bay C2-High | 14.689 |
| Gst.y | 0.097 |
| Htmax.y | 0.870 |
| Gstmax.y | 0.853 |
| Gprimest.y | 0.115 |
| Hs_Dabob Bay-Ambient | 0.158 |
| Hs_Dabob Bay-High | 0.192 |
| Hs_Fidalgo Bay-Ambient | 0.170 |
| Hs_Fidalgo Bay-High | 0.150 |
| Ht | 0.214 |
| n_Dabob Bay-Ambient | 7.379 |
| n_Dabob Bay-High | 7.709 |
| n_Fidalgo Bay-Ambient | 7.854 |
| n_Fidalgo Bay-High | 7.345 |
| Gst | 0.193 |
| Htmax | 0.789 |
| Gstmax | 0.793 |
| Gprimest | 0.249 |
# Heterozygosity
dpf.ALJ <- melt(select(myDiff.ALJ,contains("Hs")) %>% rename_at(vars(`Hs_Dabob Bay-Ambient`:`Hs_Fidalgo Bay-High`), paste0, ".z"),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(variable=gsub("\\.z", ".juvenile", variable)) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage")) %>%
mutate(stage=fct_relevel(stage, c("adult", "larvae", "juvenile")))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 824 rows [4945,
## 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958,
## 4959, 4960, 4961, 4962, 4963, 4964, ...].
ggplot(subset(dpf.ALJ, population!="Oyster Bay C2")%>% droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=stage, alpha=pCO2), adjust = 1.2) + xlab("Parental pCO2 exposure") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity: Adults, Larvae, and Juveniles\nNo. shared SNPs =", nrow(myDiff.ALJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=subset(dpf.ALJ, population!="Oyster Bay C2")%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)),
aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none") +
annotate("text", x = 3, y = .6, label = "Dabob Bay", size=3.5) +
annotate("text", x = 9, y = .6, label = "Fidalgo Bay", size=3.5) +
annotate("text", x = 14.5, y = .6, label = "Oyster Bay", size=3.5) +
geom_vline(xintercept=6.5) + geom_vline(xintercept=12.5) +
scale_x_discrete(labels=rep(c("Ambient", "High"), times=8))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Allelic Richness
dpf.n.ALJ <- melt(select(myDiff.ALJ,contains("n_")) %>% rename_at(vars(`n_Dabob Bay-Ambient`:`n_Fidalgo Bay-High`), paste0, ".z"),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(variable=gsub("\\.z", ".juvenile", variable)) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage")) %>%
mutate(stage=fct_relevel(stage, c("adult", "larvae", "juvenile")))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 824 rows [4945,
## 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958,
## 4959, 4960, 4961, 4962, 4963, 4964, ...].
ggplot(subset(dpf.n.ALJ, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=stage, alpha=pCO2),width=1.3) + xlab("Parental pCO2 exposure") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness: Adults, Larvae, and Juveniles\nNo. shared SNPs =", nrow(myDiff.ALJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
#stat_summary(fun.y=mean, geom="point", shape=15, size=2, color="black", fill="black") +
geom_text(data=subset(dpf.n.ALJ, population!="Oyster Bay C2")%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)),
aes(x=population:stage:pCO2, y=3, label=round(mean, 2)), size=2.5) +
#scale_colour_manual(values=c("black", "dodgerblue3", "darkred")) +
scale_alpha_discrete(guide="none") + geom_vline(xintercept=6.5) + geom_vline(xintercept=12.5) +
annotate("text", x = 3, y = 30, label = "Dabob Bay", size=3.5) +
annotate("text", x = 9, y = 30, label = "Fidalgo Bay", size=3.5) +
annotate("text", x = 13.5, y = 30, label = "Oyster\nBay", size=3.5) +
scale_x_discrete(labels=rep(c("Ambient", "High"), times=8))
## Warning: Using alpha for a discrete variable is not advised.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: position_dodge requires non-overlapping x intervals
I will use Colony to identify siblings in my samples. The Colony software is only available via the command line for Mac users (it’s typically used via GUI on Windows). To get the most current version of the program I actually had to email the developer. He send me a link to download it. I’m running Colony Version 2.0.6.6 (June 30, 2020), which is saved on my computer and in this project’s repo (references/colony2.mac.20200904).
Prepare files for Colony sibship analysis Colony identifies siblings, which I would like to do with my juveniles. Here I prepare files for Colony.
Mac suggested that I only keep SNPs with very high minor allele frequencies. Here I use SNPRelate to export SNPs with MAF >=2%.
snpset.adult.4Colony <- snpgdsLDpruning(genofile.adult, maf=.2, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 174,997 SNPs (monomorphic: TRUE, MAF: 0.2, missing rate: 0)
## # of samples: 53
## # of SNPs: 246
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.11%, 23/21,863
## Chromosome Contig104153_109739: 0.08%, 2/2,360
## Chromosome Contig109741_116186: 0.16%, 3/1,861
## Chromosome Contig11179_21262: 0.16%, 25/15,668
## Chromosome Contig116188_123130: 0.06%, 1/1,794
## Chromosome Contig123131_129982: 0.06%, 1/1,787
## Chromosome Contig129983_136896: 0.06%, 1/1,798
## Chromosome Contig136897_143777: 0.05%, 1/1,887
## Chromosome Contig150711_166372: 0.13%, 2/1,485
## Chromosome Contig166374_181896: 0.11%, 2/1,875
## Chromosome Contig181898_197428: 0.13%, 2/1,516
## Chromosome Contig21263_28135: 0.15%, 27/18,170
## Chromosome Contig213221_398012: 0.16%, 3/1,819
## Chromosome Contig28136_34745: 0.12%, 19/15,929
## Chromosome Contig34748_41067: 0.20%, 27/13,582
## Chromosome Contig398123_696856: 0.17%, 3/1,721
## Chromosome Contig41068_47193: 0.08%, 9/11,130
## Chromosome Contig47194_53180: 0.07%, 6/9,167
## Chromosome Contig5314_11178: 0.07%, 4/5,362
## Chromosome Contig53181_59083: 0.05%, 4/7,588
## Chromosome Contig59084_64831: 0.11%, 7/6,364
## Chromosome Contig64832_70524: 0.20%, 11/5,633
## Chromosome Contig70525_76135: 0.15%, 7/4,689
## Chromosome Contig76136_81765: 0.12%, 5/4,040
## Chromosome Contig81766_87352: 0.10%, 4/4,163
## Chromosome Contig87353_92929: 0.18%, 5/2,773
## Chromosome Contig92931_98526: 0.15%, 4/2,712
## Chromosome Contig98527_104152: 0.13%, 4/2,984
## 212 markers are selected in total.
snpset.adult.id.4Colony <- unlist(unname(snpset.adult.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-adult.gds",
dest.fn="../qc-processing/colony/genotypes-adult-4Colony.gds",
snp.id=snpset.adult.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 53 samples and 212 SNPs
## write sample.id
## write snp.id
## write snp.rs.id
## write snp.position
## write snp.chromosome
## write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
SNPRelate codes alleles this way: “There are four possible values stored in the variable genotype: 0, 1, 2 and 3. For bi-allelic SNP sites, “0” indicates two B alleles, “1” indicates one A allele and one B allele, “2” indicates two A alleles, and “3” is a missing genotype. (When you) take out genotype data with sample and SNP IDs, and four possible values are returned 0, 1, 2 and NA (3 is replaced by NA).
Colony codes alleles this way: “The first column gives the individual ID (a string containing a maximum of 20 letters and/or numbers, no other characters are allowed), the 2nd and 3rd columns give the alleles observed for the individual at the first locus, the 4th and 5th give the alleles observed for the individual at the 2nd locus … An allele is identified by an integer, in the range of 1~999999999. If the locus is a dominant marker, then only one (instead of 2) column is required for the marker, and the value for the genotype should be either 1 (dominant phenotype, presence of a band) or 2 (recessive phenotype, absence of a band). Missing genotypes are indicated by 0 0 for a codominant marker and 0 for a dominant marker.” Also, there cannot be any NA values in the Colony genotype data.
I wanted to use the R program radiator to create a Colony formatted genotype file. But after a TON of attempts I wasn’t able to get my SNPRelate .gds data into radiator directly nor could I successfully import a .vcf file.
So, here I manually re-code the genotype matrix.
# Extract genotype matrix
geno.matrix4Colony.adult <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-adult-4Colony.gds", with.id=TRUE)
## Genotype matrix: 53 samples X 212 SNPs
paste("No. of adult SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.adult$genotype), sep="")
## [1] "No. of adult SNPs for Colony sibship analysis: 212"
paste("No. of adult samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.adult$genotype), sep="")
## [1] "No. of adult samples for Colony sibship analysis: 53"
# Replace SNPRelate numeric allele coding with character allele coding
a <- geno.matrix4Colony.adult$genotype %>% as_data_frame() %>%
mutate_all(funs(str_replace(., "0", "BB"))) %>%
mutate_all(funs(str_replace(., "1", "AB"))) %>%
mutate_all(funs(str_replace(., "2", "AA")))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info.
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2.
c <- b #duplicate dataframe
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])
for (i in 1:length(odds)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)
}
for (i in 1:length(evens)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)
}
# Now replace the character allele information with Colony's way of numeric coding.
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele
# 2= A allele
# 0= missing allele
(geno.matrix4Colony.adult.recode <- c %>%
mutate_all(funs(str_replace(., "B", "1"))) %>%
mutate_all(funs(str_replace(., "A", "2"))) %>%
mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 53 x 424
## v1 v1_2 v2 v2_2 v3 v3_2 v4 v4_2 v5 v5_2 v6 v6_2 v7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 2 2 2 1 2 2 2 1 1 1 2
## 2 2 2 2 2 1 1 1 1 2 1 1 1 2
## 3 2 2 2 1 2 1 2 1 2 2 2 2 2
## 4 2 2 2 1 2 1 1 1 1 1 2 2 2
## 5 2 2 1 1 2 2 2 1 2 1 2 2 1
## 6 2 2 2 1 2 1 2 1 2 1 1 1 2
## 7 2 2 2 2 2 1 2 1 2 2 2 2 2
## 8 2 2 1 1 2 1 1 1 1 1 1 1 2
## 9 2 2 1 1 2 1 1 1 1 1 1 1 2
## 10 2 1 2 1 2 1 1 1 2 1 2 2 2
## # … with 43 more rows, and 411 more variables: v7_2 <chr>, v8 <chr>,
## # v8_2 <chr>, v9 <chr>, v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>,
## # v11_2 <chr>, v12 <chr>, v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>,
## # v14_2 <chr>, v15 <chr>, v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>,
## # v17_2 <chr>, v18 <chr>, v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>,
## # v20_2 <chr>, v21 <chr>, v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>,
## # v23_2 <chr>, v24 <chr>, v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>,
## # v26_2 <chr>, v27 <chr>, v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>,
## # v29_2 <chr>, v30 <chr>, v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>,
## # v32_2 <chr>, v33 <chr>, v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>,
## # v35_2 <chr>, v36 <chr>, v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>,
## # v38_2 <chr>, v39 <chr>, v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>,
## # v41_2 <chr>, v42 <chr>, v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>,
## # v44_2 <chr>, v45 <chr>, v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>,
## # v47_2 <chr>, v48 <chr>, v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>,
## # v50_2 <chr>, v51 <chr>, v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>,
## # v53_2 <chr>, v54 <chr>, v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>,
## # v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.adult$sample.id, sep=""),
geno.matrix4Colony.adult.recode),
file="../qc-processing/colony/geno.matrix.adult.tab", delim=" ", col_names=FALSE)
I created a text file that will be the input for Colony. I used Colony’s example input file to do so.
Here’s a snapshot of the input file, truncating the genotype info (there are 53 samples, but here I only show 2)
head -n 28 ../qc-processing/colony/colony2_adults.dat
tail -n 12 ../qc-processing/colony/colony2_adults.dat
## 'adult_snps' !Dataset name
## 'adult_colony' !Output file name
## 53 ! Number of offspring in the sample
## 212 ! Number of loci
## 100 ! Seed for random number generator
## 0 ! 0/1=Not updating/updating allele frequency
## 2 ! 2/1=Dioecious/Monoecious species
## 0 ! 0/1=No inbreeding/inbreeding
## 0 ! 0/1=Diploid species/HaploDiploid species
## 0 0 ! 0/1=Polygamy/Monogamy for males & females
## 0 ! 0/1=Clone inference =No/Yes
## 0 ! 0/1=Full sibship size scaling =No/Yes
## 0 ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0 ! 0/1=Unknown/Known population allele frequency
## 1 ! Number of runs
## 2 ! Length of run
## 0 ! 0/1=Monitor method by Iterate#/Time in second
## 100000 ! Monitor interval in Iterate# / in seconds
## 0 ! non-Windows version
## 1 ! Fulllikelihood
## 3 ! 1/2/3=low/medium/high Precision for Fulllikelihood
##
## V@ !Marker names
## 0@ !Marker types, 0/1 = codominant/dominant
## 0.000@ !Allelic dropout rate
## 0.0001@ !false allele rate
##
## O291 2 1 2 2 2 1 2 2 2 1 1 1 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 1 1 1 1 2 1 2 1 2 2 2 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 1 2 1 1 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 1 1 1 2 2 2 1 1 1 2 2 2 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 2 2 2 2 2 1 2 2 2 1
## O349 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1 1 2 2 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 1 2 2 2 1 1 1 2 2 2 1 2 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 2 2 1 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1
##
## 0 0 !prob. of dad/mum included in the candidates
## 0 0 !numbers of candiadte males & females
## 0 0 !#known fater-offspring dyads, paternity exclusion threshold
## 0 0 !#known moter-offspring dyads, maternity exclusion threshold
## 0 !#known paternal sibship with unknown fathers
## 0 !#known maternal sibship with unknown mothers
## 0 !#known paternity exclusions
## 0 !#known maternity exclusions
## 0 !#known paternal sibship exclusions
## 0 !#known maternal sibship exclusions
Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script
cd ../qc-processing/colony/
./colony2s.out IFN:colony2_adults.dat
cat ../qc-processing/colony/adult_colony.BestCluster
## ClusterIndex Probability OffspringID FatherID MotherID
## 1 0.7953 O291 *1 #1
## 1 0.7953 O293 *3 #1
## 1 0.7953 O294 *3 #3
## 1 0.7953 O302 *1 #3
## 1 0.7953 O305 *1 #3
## 1 0.7953 O307 *1 #3
## 1 0.7953 O309 *1 #3
## 2 0.9974 O292 *2 #2
## 2 0.9974 O296 *2 #5
## 2 0.9974 O299 *2 #5
## 2 0.9974 O301 *2 #2
## 3 0.9199 O295 *4 #4
## 3 0.9199 O303 *4 #4
## 3 0.9199 O304 *4 #4
## 3 0.9199 O308 *4 #4
## 4 0.9864 O298 *5 #6
## 4 0.9864 O306 *6 #6
## 5 0.9859 O311 *7 #7
## 5 0.9859 O315 *7 #11
## 5 0.9859 O318 *7 #12
## 5 0.9859 O328 *7 #11
## 5 0.9859 O329 *7 #11
## 6 0.7866 O312 *8 #8
## 6 0.7866 O314 *8 #10
## 6 0.7866 O316 *8 #8
## 6 0.7866 O317 *8 #10
## 6 0.7866 O321 *8 #10
## 6 0.7866 O323 *8 #8
## 6 0.7866 O327 *8 #10
## 7 0.2345 O313 *9 #9
## 7 0.2345 O319 *10 #13
## 7 0.2345 O322 *9 #13
## 7 0.2345 O324 *9 #9
## 7 0.2345 O326 *9 #13
## 8 0.3050 O325 *11 #14
## 8 0.3050 O333 *14 #17
## 8 0.3050 O337 *14 #14
## 9 0.9090 O331 *12 #15
## 9 0.9090 O336 *12 #19
## 9 0.9090 O341 *17 #15
## 9 0.9090 O342 *12 #22
## 9 0.9090 O345 *19 #22
## 9 0.9090 O346 *19 #15
## 9 0.9090 O348 *12 #22
## 10 0.9796 O332 *13 #16
## 10 0.9796 O334 *15 #16
## 10 0.9796 O339 *13 #21
## 10 0.9796 O347 *13 #16
## 11 0.9952 O335 *16 #18
## 11 0.9952 O338 *16 #20
## 11 0.9952 O343 *16 #20
## 12 0.9870 O344 *18 #23
## 12 0.9870 O349 *18 #23
colony.clusters.adult <- read_table("../qc-processing/colony/adult_colony.BestCluster", col_names=TRUE) %>% clean_names() %>%
mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.adult, by = c("offspring_id"="sample"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ClusterIndex = col_double(),
## Probability = col_double(),
## OffspringID = col_character(),
## FatherID = col_character(),
## MotherID = col_character()
## )
save(colony.clusters.adult, file = "../qc-processing/colony/best-cluster-adult")
# Plot mother ~ father ID
#ggplotly(
ggplot(colony.clusters.adult, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none") #)
# How many fathers and mothers does each population + pCO2 group have?
colony.clusters.adult %>%
group_by(population, pCO2.parent) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id))
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups: population [3]
## population pCO2.parent n_fathers n_mothers
## <chr> <chr> <int> <int>
## 1 Dabob Bay Ambient 4 4
## 2 Dabob Bay High 5 6
## 3 Fidalgo Bay Ambient 6 5
## 4 Fidalgo Bay High 5 8
## 5 Oyster Bay C1 Ambient 4 6
## 6 Oyster Bay C1 High 4 7
# How many fathers and mothers does each population have
colony.clusters.adult %>%
group_by(population) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id))
## # A tibble: 3 x 3
## population n_fathers n_mothers
## <chr> <int> <int>
## 1 Dabob Bay 6 6
## 2 Fidalgo Bay 8 10
## 3 Oyster Bay C1 5 8
snpset.larvae.4Colony <- snpgdsLDpruning(genofile.larvae, maf=.2, missing.rate=0.15, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 127,940 SNPs (monomorphic: TRUE, MAF: 0.2, missing rate: 0.15)
## # of samples: 76
## # of SNPs: 282
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.13%, 20/15,947
## Chromosome Contig104153_109739: 0.25%, 4/1,625
## Chromosome Contig109741_116186: 0.15%, 2/1,328
## Chromosome Contig11179_21262: 0.21%, 26/12,305
## Chromosome Contig116188_123130: 0.50%, 6/1,207
## Chromosome Contig123131_129982: 0.27%, 3/1,099
## Chromosome Contig129983_136896: 0.33%, 4/1,222
## Chromosome Contig136897_143777: 0.23%, 3/1,321
## Chromosome Contig143780_150710: 0.25%, 3/1,179
## Chromosome Contig150711_166372: 0.19%, 2/1,031
## Chromosome Contig166374_181896: 0.09%, 1/1,077
## Chromosome Contig181898_197428: 0.09%, 1/1,122
## Chromosome Contig197431_213218: 0.18%, 2/1,107
## Chromosome Contig21263_28135: 0.18%, 24/13,677
## Chromosome Contig213221_398012: 0.17%, 2/1,191
## Chromosome Contig28136_34745: 0.17%, 22/12,701
## Chromosome Contig34748_41067: 0.14%, 15/10,383
## Chromosome Contig41068_47193: 0.16%, 13/7,913
## Chromosome Contig47194_53180: 0.13%, 9/6,968
## Chromosome Contig5314_11178: 0.14%, 5/3,691
## Chromosome Contig53181_59083: 0.19%, 11/5,690
## Chromosome Contig59084_64831: 0.23%, 10/4,432
## Chromosome Contig64832_70524: 0.30%, 12/4,031
## Chromosome Contig70525_76135: 0.32%, 11/3,438
## Chromosome Contig76136_81765: 0.07%, 2/2,871
## Chromosome Contig81766_87352: 0.11%, 3/2,843
## Chromosome Contig87353_92929: 0.31%, 6/1,952
## Chromosome Contig92931_98526: 0.42%, 8/1,900
## Chromosome Contig98527_104152: 0.10%, 2/1,979
## 232 markers are selected in total.
snpset.larvae.id.4Colony <- unlist(unname(snpset.larvae.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-larvae.gds",
dest.fn="../qc-processing/colony/genotypes-larvae-4Colony.gds",
snp.id=snpset.larvae.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 76 samples and 232 SNPs
## write sample.id
## write snp.id
## write snp.rs.id
## write snp.position
## write snp.chromosome
## write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
# Extract genotype matrix
geno.matrix4Colony.larvae <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-larvae-4Colony.gds", with.id=TRUE)
## Genotype matrix: 76 samples X 232 SNPs
paste("No. of larvae SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.larvae$genotype), sep="")
## [1] "No. of larvae SNPs for Colony sibship analysis: 232"
paste("No. of larvae samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.larvae$genotype), sep="")
## [1] "No. of larvae samples for Colony sibship analysis: 76"
# Replace SNPRelate numeric allele coding with character allele coding
a <- geno.matrix4Colony.larvae$genotype %>% as_data_frame() %>%
mutate_all(funs(str_replace(., "0", "BB"))) %>%
mutate_all(funs(str_replace(., "1", "AB"))) %>%
mutate_all(funs(str_replace(., "2", "AA")))
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info.
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2.
c <- b #duplicate dataframe
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])
for (i in 1:length(odds.names)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)
}
for (i in 1:length(evens.names)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)
}
# Now replace the character allele information with Colony's way of numeric coding.
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele
# 2= A allele
# 0= missing allele
(geno.matrix4Colony.larvae.recode <- c %>%
mutate_all(funs(str_replace(., "B", "1"))) %>%
mutate_all(funs(str_replace(., "A", "2"))) %>%
mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 76 x 464
## v1 v1_2 v2 v2_2 v3 v3_2 v4 v4_2 v5 v5_2 v6 v6_2 v7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 2 2 1 2 2 0 0 2 2 2 2 2
## 2 0 0 2 2 1 1 0 0 0 0 0 0 2
## 3 0 0 2 2 0 0 0 0 2 2 2 1 0
## 4 2 2 2 1 2 2 2 2 2 2 0 0 2
## 5 1 1 2 1 2 2 1 1 2 2 2 2 2
## 6 0 0 2 2 2 2 1 1 2 2 2 2 2
## 7 0 0 0 0 2 2 0 0 0 0 0 0 2
## 8 0 0 0 0 0 0 0 0 2 2 2 2 2
## 9 2 1 2 1 2 2 2 2 2 2 2 2 0
## 10 2 1 2 1 2 2 2 2 1 1 2 2 2
## # … with 66 more rows, and 451 more variables: v7_2 <chr>, v8 <chr>,
## # v8_2 <chr>, v9 <chr>, v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>,
## # v11_2 <chr>, v12 <chr>, v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>,
## # v14_2 <chr>, v15 <chr>, v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>,
## # v17_2 <chr>, v18 <chr>, v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>,
## # v20_2 <chr>, v21 <chr>, v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>,
## # v23_2 <chr>, v24 <chr>, v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>,
## # v26_2 <chr>, v27 <chr>, v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>,
## # v29_2 <chr>, v30 <chr>, v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>,
## # v32_2 <chr>, v33 <chr>, v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>,
## # v35_2 <chr>, v36 <chr>, v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>,
## # v38_2 <chr>, v39 <chr>, v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>,
## # v41_2 <chr>, v42 <chr>, v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>,
## # v44_2 <chr>, v45 <chr>, v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>,
## # v47_2 <chr>, v48 <chr>, v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>,
## # v50_2 <chr>, v51 <chr>, v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>,
## # v53_2 <chr>, v54 <chr>, v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>,
## # v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.larvae$sample.id, sep=""),
geno.matrix4Colony.larvae.recode),
file="../qc-processing/colony/geno.matrix.larvae.tab", delim=" ", col_names=FALSE)
I created a text file that will be the input for Colony. I used Colony’s example input file as template and edited by hand. For the genotype matrix I selected all contents in the “geno.matrix.larvae.tab” file and pasted it into the text file.
Here’s a snapshot of the input file, truncating the genotype info (there are 76 samples, but here I only show 4)
head -n 30 ../qc-processing/colony/colony2_larvae.dat
tail -n 12 ../qc-processing/colony/colony2_larvae.dat
## 'larvae_snps' !Dataset name
## 'larvae_colony' !Output file name
## 76 ! Number of offspring in the sample
## 231 ! Number of loci
## 100 ! Seed for random number generator
## 0 ! 0/1=Not updating/updating allele frequency
## 2 ! 2/1=Dioecious/Monoecious species
## 0 ! 0/1=No inbreeding/inbreeding
## 0 ! 0/1=Diploid species/HaploDiploid species
## 0 0 ! 0/1=Polygamy/Monogamy for males & females
## 0 ! 0/1=Clone inference =No/Yes
## 0 ! 0/1=Full sibship size scaling =No/Yes
## 0 ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0 ! 0/1=Unknown/Known population allele frequency
## 1 ! Number of runs
## 2 ! Length of run
## 0 ! 0/1=Monitor method by Iterate#/Time in second
## 100000 ! Monitor interval in Iterate# / in seconds
## 0 ! non-Windows version
## 1 ! Fulllikelihood
## 3 ! 1/2/3=low/medium/high Precision for Fulllikelihood
##
## V@ !Marker names
## 0@ !Marker types, 0/1 = codominant/dominant
## 0.000@ !Allelic dropout rate
## 0.0001@ !false allele rate
##
## O034 2 2 2 1 2 2 0 0 2 2 2 2 2 1 2 1 2 2 2 2 0 0 0 0 1 1 2 2 0 0 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 2 2 1 2 2 2 2 2 1 2 2 1 1 0 0 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 1 0 0 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 1 2 1 2 1 1 1 0 0 2 1 2 2 0 0 2 2 1 1 1 1 1 1 2 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 0 0 2 2 2 2 1 1 2 1 0 0 2 2 2 1 0 0 2 2 2 2 1 1 2 2 2 2 2 1 0 0 2 2 0 0 2 1 1 1 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 0 2 2 2 2 2 2 1 1 2 1 2 2 0 0 1 1 2 2 2 1 2 2 2 2 2 2 2 2 0 0 0 0 2 2 0 0 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 0 0 1 1 2 1 2 1 2 2 2 1 1 1 0 0 2 1 0 0 2 2 2 2 2 1 0 0 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 2 1 2 1 2 2 0 0 2 1 2 1 1 1 2 2 2 2 2 2 1 1 0 0 2 2 0 0 2 2 2 2 2 2 0 0 2 1 2 1 2 1 1 1 2 1 0 0 2 2 2 1 2 2 2 2 2 2 2 2 0 0 2 2 2 1 2 2 1 1 2 1 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 0 0 2 2 0 0 2 2 2 1 0 0 2 1 0 0 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 0 0 0 0 2 2 2 1 2 2 2 2 0 0 2 2 2 1 2 1 2 1 2 2 0 0 2 2 1 1 2 1 0 0
## O035 0 0 2 2 1 1 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 1 1 2 2 0 0 2 2 0 0 1 1 2 2 0 0 0 0 1 1 2 2 2 2 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 1 1 0 0 0 0 0 0 2 2
## O037 0 0 2 2 0 0 0 0 2 2 2 1 0 0 2 2 0 0 0 0 1 1 0 0 0 0 0 0 1 1 2 2 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 1 1 1 0 0 2 2 2 2 0 0 2 2 0 0 2 2 2 2 0 0 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 0 1 1 0 0 0 0 0 0 2 2 0 0 0 0 2 1 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 1 0 0 2 2 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 2 1 0 0 2 2 0 0 2 2 1 1 0 0 0 0 2 2 2 2 0 0 2 2 2 1 0 0 0 0 2 2 0 0 2 2 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 1 2 2 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 0 0 0 0 2 2 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 2 2 2 1 1 0 0 1 1 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0
## O565 2 2 2 2 0 0 0 0 2 2 0 0 2 2 2 2 0 0 2 2 0 0 2 2 2 1 2 2 2 2 0 0 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 1 0 0 0 0 2 2 2 2 0 0 0 0 2 2 2 2 0 0 0 0 2 2 2 2 2 2 0 0 1 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 1 1 0 0 2 2 2 2 0 0 2 1 0 0 0 0 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 2 2 0 0 0 0 2 2 2 2 2 2 0 0 0 0 2 2 1 1 0 0 1 1 0 0 2 2 2 2 0 0 0 0 2 2 2 2 2 2 0 0 0 0 1 1 1 1 2 2 2 2 0 0 2 2 2 2 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 2 1 2 2 0 0 2 2 0 0 2 2 2 2 0 0 2 2 2 2 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 2 2 2 2 2 2 0 0 2 2 2 2 1 1 2 2 2 2 0 0 2 2 2 2 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 1 1 0 0 2 2 2 2 0 0 2 1 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 0 0 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 0 0 0 2 2 0 0 2 2 2 1 2 1 0 0 2 2 1 1
##
## 0 0 !prob. of dad/mum included in the candidates
## 0 0 !numbers of candiadte males & females
## 0 0 !#known fater-offspring dyads, paternity exclusion threshold
## 0 0 !#known moter-offspring dyads, maternity exclusion threshold
## 0 !#known paternal sibship with unknown fathers
## 0 !#known maternal sibship with unknown mothers
## 0 !#known paternity exclusions
## 0 !#known maternity exclusions
## 0 !#known paternal sibship exclusions
## 0 !#known maternal sibship exclusions
Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script
cd ../qc-processing/colony/
./colony2s.out IFN:colony2_larvae.dat
cat ../qc-processing/colony/larvae_colony.BestCluster
## ClusterIndex Probability OffspringID FatherID MotherID
## 1 0.6778 O034 *1 #1
## 1 0.6778 O037 *3 #1
## 1 0.6778 O044 *3 #1
## 1 0.6778 O046 *7 #1
## 1 0.6778 O047 *7 #7
## 1 0.6778 O483 *3 #23
## 1 0.6778 O485 *23 #25
## 1 0.6778 O487 *23 #25
## 1 0.6778 O526 *7 #25
## 1 0.6778 O528 *23 #32
## 2 1.0000 O035 *2 #2
## 2 1.0000 O412 *2 #2
## 2 1.0000 O441 *2 #2
## 2 1.0000 O565 *2 #2
## 3 0.1356 O039 *4 #3
## 3 0.1356 O484 *22 #24
## 3 0.1356 O513 *4 #24
## 3 0.1356 O531 *22 #34
## 3 0.1356 O542 *33 #3
## 3 0.1356 O543 *22 #36
## 3 0.1356 O552 *4 #37
## 3 0.1356 O553 *34 #36
## 4 0.8784 O041 *5 #4
## 4 0.8784 O043 *5 #5
## 5 0.4926 O045 *6 #6
## 5 0.4926 O489 *24 #26
## 5 0.4926 O522 *27 #29
## 5 0.4926 O523 *28 #30
## 5 0.4926 O524 *28 #31
## 5 0.4926 O525 *28 #6
## 5 0.4926 O527 *29 #31
## 5 0.4926 O562 *27 #30
## 5 0.4926 O564 *24 #30
## 6 0.9721 O401 *8 #8
## 6 0.9721 O402 *9 #8
## 6 0.9721 O403 *10 #8
## 6 0.9721 O404 *11 #8
## 6 0.9721 O411 *12 #8
## 6 0.9721 O413 *13 #8
## 6 0.9721 O414 *8 #9
## 6 0.9721 O421 *14 #8
## 6 0.9721 O431 *15 #8
## 6 0.9721 O432 *8 #8
## 6 0.9721 O434 *15 #10
## 6 0.9721 O442 *16 #9
## 6 0.9721 O444 *18 #9
## 6 0.9721 O445 *18 #9
## 6 0.9721 O451 *18 #12
## 6 0.9721 O453 *16 #12
## 7 0.1504 O443 *17 #11
## 7 0.1504 O452 *17 #13
## 7 0.1504 O472 *17 #17
## 7 0.1504 O473 *17 #18
## 7 0.1504 O475 *17 #18
## 7 0.1504 O476 *17 #17
## 7 0.1504 O477 *17 #20
## 7 0.1504 O521 *26 #13
## 7 0.1504 O554 *26 #38
## 8 0.8628 O461 *19 #14
## 8 0.8628 O462 *19 #15
## 8 0.8628 O471 *19 #16
## 8 0.8628 O474 *19 #19
## 9 0.8967 O481 *20 #21
## 9 0.8967 O492 *20 #27
## 9 0.8967 O506 *20 #28
## 9 0.8967 O532 *31 #35
## 9 0.8967 O533 *32 #35
## 9 0.8967 O551 *31 #27
## 9 0.8967 O561 *35 #35
## 9 0.8967 O563 *31 #27
## 10 0.9372 O482 *21 #22
## 10 0.9372 O488 *21 #22
## 10 0.9372 O490 *25 #22
## 10 0.9372 O491 *25 #22
## 10 0.9372 O541 *21 #22
## 11 0.7313 O529 *30 #33
(colony.clusters.larvae <- read_table("../qc-processing/colony/larvae_colony.BestCluster", col_names=TRUE) %>% clean_names() %>%
mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.larvae, by = c("offspring_id"="sample")))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ClusterIndex = col_double(),
## Probability = col_double(),
## OffspringID = col_character(),
## FatherID = col_character(),
## MotherID = col_character()
## )
## cluster_index probability offspring_id father_id mother_id stage
## 1 1 0.6778 034 1 1 larvae
## 2 1 0.6778 037 3 1 larvae
## 3 1 0.6778 044 3 1 larvae
## 4 1 0.6778 046 7 1 larvae
## 5 1 0.6778 047 7 7 larvae
## 6 1 0.6778 483 3 23 larvae
## 7 1 0.6778 485 23 25 larvae
## 8 1 0.6778 487 23 25 larvae
## 9 1 0.6778 526 7 25 larvae
## 10 1 0.6778 528 23 32 larvae
## 11 2 1.0000 035 2 2 larvae
## 12 2 1.0000 412 2 2 larvae
## 13 2 1.0000 441 2 2 larvae
## 14 2 1.0000 565 2 2 larvae
## 15 3 0.1356 039 4 3 larvae
## 16 3 0.1356 484 22 24 larvae
## 17 3 0.1356 513 4 24 larvae
## 18 3 0.1356 531 22 34 larvae
## 19 3 0.1356 542 33 3 larvae
## 20 3 0.1356 543 22 36 larvae
## 21 3 0.1356 552 4 37 larvae
## 22 3 0.1356 553 34 36 larvae
## 23 4 0.8784 041 5 4 larvae
## 24 4 0.8784 043 5 5 larvae
## 25 5 0.4926 045 6 6 larvae
## 26 5 0.4926 489 24 26 larvae
## 27 5 0.4926 522 27 29 larvae
## 28 5 0.4926 523 28 30 larvae
## 29 5 0.4926 524 28 31 larvae
## 30 5 0.4926 525 28 6 larvae
## 31 5 0.4926 527 29 31 larvae
## 32 5 0.4926 562 27 30 larvae
## 33 5 0.4926 564 24 30 larvae
## 34 6 0.9721 401 8 8 larvae
## 35 6 0.9721 402 9 8 larvae
## 36 6 0.9721 403 10 8 larvae
## 37 6 0.9721 404 11 8 larvae
## 38 6 0.9721 411 12 8 larvae
## 39 6 0.9721 413 13 8 larvae
## 40 6 0.9721 414 8 9 larvae
## 41 6 0.9721 421 14 8 larvae
## 42 6 0.9721 431 15 8 larvae
## 43 6 0.9721 432 8 8 larvae
## 44 6 0.9721 434 15 10 larvae
## 45 6 0.9721 442 16 9 larvae
## 46 6 0.9721 444 18 9 larvae
## 47 6 0.9721 445 18 9 larvae
## 48 6 0.9721 451 18 12 larvae
## 49 6 0.9721 453 16 12 larvae
## 50 7 0.1504 443 17 11 larvae
## 51 7 0.1504 452 17 13 larvae
## 52 7 0.1504 472 17 17 larvae
## 53 7 0.1504 473 17 18 larvae
## 54 7 0.1504 475 17 18 larvae
## 55 7 0.1504 476 17 17 larvae
## 56 7 0.1504 477 17 20 larvae
## 57 7 0.1504 521 26 13 larvae
## 58 7 0.1504 554 26 38 larvae
## 59 8 0.8628 461 19 14 larvae
## 60 8 0.8628 462 19 15 larvae
## 61 8 0.8628 471 19 16 larvae
## 62 8 0.8628 474 19 19 larvae
## 63 9 0.8967 481 20 21 larvae
## 64 9 0.8967 492 20 27 larvae
## 65 9 0.8967 506 20 28 larvae
## 66 9 0.8967 532 31 35 larvae
## 67 9 0.8967 533 32 35 larvae
## 68 9 0.8967 551 31 27 larvae
## 69 9 0.8967 561 35 35 larvae
## 70 9 0.8967 563 31 27 larvae
## 71 10 0.9372 482 21 22 larvae
## 72 10 0.9372 488 21 22 larvae
## 73 10 0.9372 490 25 22 larvae
## 74 10 0.9372 491 25 22 larvae
## 75 10 0.9372 541 21 22 larvae
## 76 11 0.7313 529 30 33 larvae
## population pCO2.parent pop_code
## 1 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 2 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 3 Oyster Bay C1 High Oyster Bay C1-High
## 4 Oyster Bay C1 High Oyster Bay C1-High
## 5 Oyster Bay C1 High Oyster Bay C1-High
## 6 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 7 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 8 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 9 Oyster Bay C1 High Oyster Bay C1-High
## 10 Oyster Bay C1 High Oyster Bay C1-High
## 11 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 12 Dabob Bay High Dabob Bay-High
## 13 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 14 Oyster Bay C2 High Oyster Bay C2-High
## 15 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 16 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 17 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 18 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 19 Oyster Bay C2 High Oyster Bay C2-High
## 20 Oyster Bay C2 High Oyster Bay C2-High
## 21 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 22 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 23 Oyster Bay C1 High Oyster Bay C1-High
## 24 Oyster Bay C1 High Oyster Bay C1-High
## 25 Oyster Bay C1 High Oyster Bay C1-High
## 26 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 27 Oyster Bay C1 High Oyster Bay C1-High
## 28 Oyster Bay C1 High Oyster Bay C1-High
## 29 Oyster Bay C1 High Oyster Bay C1-High
## 30 Oyster Bay C1 High Oyster Bay C1-High
## 31 Oyster Bay C1 High Oyster Bay C1-High
## 32 Oyster Bay C2 High Oyster Bay C2-High
## 33 Oyster Bay C2 High Oyster Bay C2-High
## 34 Dabob Bay Ambient Dabob Bay-Ambient
## 35 Dabob Bay Ambient Dabob Bay-Ambient
## 36 Dabob Bay Ambient Dabob Bay-Ambient
## 37 Dabob Bay Ambient Dabob Bay-Ambient
## 38 Dabob Bay High Dabob Bay-High
## 39 Dabob Bay High Dabob Bay-High
## 40 Dabob Bay High Dabob Bay-High
## 41 Dabob Bay Ambient Dabob Bay-Ambient
## 42 Dabob Bay High Dabob Bay-High
## 43 Dabob Bay High Dabob Bay-High
## 44 Dabob Bay High Dabob Bay-High
## 45 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 46 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 47 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 48 Fidalgo Bay High Fidalgo Bay-High
## 49 Fidalgo Bay High Fidalgo Bay-High
## 50 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 51 Fidalgo Bay High Fidalgo Bay-High
## 52 Fidalgo Bay High Fidalgo Bay-High
## 53 Fidalgo Bay High Fidalgo Bay-High
## 54 Fidalgo Bay High Fidalgo Bay-High
## 55 Fidalgo Bay High Fidalgo Bay-High
## 56 Fidalgo Bay High Fidalgo Bay-High
## 57 Oyster Bay C1 High Oyster Bay C1-High
## 58 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 59 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 60 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 61 Fidalgo Bay High Fidalgo Bay-High
## 62 Fidalgo Bay High Fidalgo Bay-High
## 63 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 64 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 65 Oyster Bay C1 High Oyster Bay C1-High
## 66 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 67 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 68 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 69 Oyster Bay C2 High Oyster Bay C2-High
## 70 Oyster Bay C2 High Oyster Bay C2-High
## 71 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 72 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 73 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 74 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 75 Oyster Bay C2 High Oyster Bay C2-High
## 76 Oyster Bay C1 High Oyster Bay C1-High
save(colony.clusters.larvae, file = "../qc-processing/colony/best-cluster-larvae")
# Plot mother ~ father ID
ggplotly(
ggplot(colony.clusters.larvae, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none"))
# How many fathers and mothers does each population + pCO2 group have?
colony.clusters.larvae %>%
group_by(population, pCO2.parent) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) %>%
pivot_longer(names_to = "parent", values_to="n_parents", cols = c(n_fathers, n_mothers)) %>%
mutate(population=as.factor(population),pCO2.parent=as.factor(pCO2.parent),parent=as.factor(parent)) %>%
ggplot(aes(fill=population, alpha=pCO2.parent, x=population, y=n_parents)) + geom_bar(position="dodge", stat="identity") +
scale_alpha_discrete(range = c(0.35, 0.8)) + ylab("No. of parents") +
facet_wrap(~parent) + theme_minimal() + theme(axis.text.x = element_blank()) + xlab("Population & parental pCO2 exposure")
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
snpset.juvenile.4Colony <- snpgdsLDpruning(genofile.juvenile, maf=.4, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 48,939 SNPs (monomorphic: TRUE, MAF: 0.4, missing rate: 0)
## # of samples: 16
## # of SNPs: 1,669
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 1.73%, 99/5,731
## Chromosome Contig104153_109739: 1.60%, 12/750
## Chromosome Contig109741_116186: 2.09%, 14/670
## Chromosome Contig11179_21262: 1.76%, 79/4,498
## Chromosome Contig116188_123130: 2.41%, 14/580
## Chromosome Contig123131_129982: 1.42%, 6/422
## Chromosome Contig129983_136896: 1.49%, 8/536
## Chromosome Contig136897_143777: 1.95%, 13/667
## Chromosome Contig143780_150710: 2.06%, 10/486
## Chromosome Contig150711_166372: 2.20%, 10/455
## Chromosome Contig166374_181896: 2.36%, 13/550
## Chromosome Contig181898_197428: 2.36%, 10/423
## Chromosome Contig197431_213218: 2.13%, 12/564
## Chromosome Contig21263_28135: 1.88%, 94/4,994
## Chromosome Contig213221_398012: 2.16%, 11/510
## Chromosome Contig28136_34745: 1.68%, 80/4,772
## Chromosome Contig34748_41067: 1.83%, 70/3,817
## Chromosome Contig398123_696856: 1.89%, 9/476
## Chromosome Contig41068_47193: 1.72%, 55/3,201
## Chromosome Contig47194_53180: 2.20%, 62/2,819
## Chromosome Contig5314_11178: 1.77%, 25/1,414
## Chromosome Contig53181_59083: 1.89%, 48/2,534
## Chromosome Contig59084_64831: 2.01%, 36/1,792
## Chromosome Contig64832_70524: 1.81%, 30/1,654
## Chromosome Contig70525_76135: 2.10%, 28/1,331
## Chromosome Contig76136_81765: 1.81%, 24/1,329
## Chromosome Contig81766_87352: 2.08%, 25/1,203
## Chromosome Contig87353_92929: 1.50%, 13/866
## Chromosome Contig92931_98526: 2.15%, 17/791
## Chromosome Contig98527_104152: 2.07%, 16/773
## 943 markers are selected in total.
snpset.juvenile.id.4Colony <- unlist(unname(snpset.juvenile.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-juvenile.gds",
dest.fn="../qc-processing/colony/genotypes-juvenile-4Colony.gds",
snp.id=snpset.juvenile.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 16 samples and 943 SNPs
## write sample.id
## write snp.id
## write snp.rs.id
## write snp.position
## write snp.chromosome
## write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
# Extract genotype matrix
geno.matrix4Colony.juvenile <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-juvenile-4Colony.gds", with.id=TRUE)
## Genotype matrix: 16 samples X 943 SNPs
paste("No. of juvenile SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.juvenile$genotype), sep="")
## [1] "No. of juvenile SNPs for Colony sibship analysis: 943"
paste("No. of juvenile samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.juvenile$genotype), sep="")
## [1] "No. of juvenile samples for Colony sibship analysis: 16"
# Replace SNPRelate numeric allele coding with character allele coding
a <- geno.matrix4Colony.juvenile$genotype %>% as_data_frame() %>%
mutate_all(funs(str_replace(., "0", "BB"))) %>%
mutate_all(funs(str_replace(., "1", "AB"))) %>%
mutate_all(funs(str_replace(., "2", "AA")))
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info.
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2.
c <- b #duplicate dataframe
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])
for (i in 1:length(odds.names)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)
}
for (i in 1:length(evens.names)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)
}
# Now replace the character allele information with Colony's way of numeric coding.
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele
# 2= A allele
# 0= missing allele
(geno.matrix4Colony.juvenile.recode <- c %>%
mutate_all(funs(str_replace(., "B", "1"))) %>%
mutate_all(funs(str_replace(., "A", "2"))) %>%
mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 16 x 1,886
## v1 v1_2 v2 v2_2 v3 v3_2 v4 v4_2 v5 v5_2 v6 v6_2 v7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 2 1 2 2 2 1 1 1 2 1 2
## 2 2 1 1 1 1 1 2 2 1 1 2 2 2
## 3 1 1 2 2 2 2 2 1 2 1 1 1 2
## 4 2 1 2 1 1 1 2 1 2 1 2 1 2
## 5 2 2 2 2 1 1 2 2 2 2 1 1 1
## 6 2 1 2 2 2 2 2 2 2 1 2 2 2
## 7 2 2 2 1 2 1 2 1 2 1 2 1 2
## 8 2 1 2 1 2 1 2 1 2 2 2 1 1
## 9 1 1 2 2 2 2 2 1 2 1 2 1 2
## 10 2 2 1 1 2 2 2 1 2 1 2 2 2
## 11 2 1 1 1 2 1 2 1 2 1 2 2 2
## 12 2 1 1 1 2 2 2 1 1 1 2 1 2
## 13 2 1 2 1 2 1 2 1 2 2 2 1 1
## 14 2 2 2 1 2 1 2 1 2 2 2 1 1
## 15 2 1 2 2 1 1 2 1 2 2 2 1 1
## 16 2 2 2 1 2 1 1 1 2 1 2 1 1
## # … with 1,873 more variables: v7_2 <chr>, v8 <chr>, v8_2 <chr>, v9 <chr>,
## # v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>, v11_2 <chr>, v12 <chr>,
## # v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>, v14_2 <chr>, v15 <chr>,
## # v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>, v17_2 <chr>, v18 <chr>,
## # v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>, v20_2 <chr>, v21 <chr>,
## # v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>, v23_2 <chr>, v24 <chr>,
## # v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>, v26_2 <chr>, v27 <chr>,
## # v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>, v29_2 <chr>, v30 <chr>,
## # v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>, v32_2 <chr>, v33 <chr>,
## # v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>, v35_2 <chr>, v36 <chr>,
## # v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>, v38_2 <chr>, v39 <chr>,
## # v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>, v41_2 <chr>, v42 <chr>,
## # v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>, v44_2 <chr>, v45 <chr>,
## # v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>, v47_2 <chr>, v48 <chr>,
## # v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>, v50_2 <chr>, v51 <chr>,
## # v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>, v53_2 <chr>, v54 <chr>,
## # v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>, v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.juvenile$sample.id, sep=""),
geno.matrix4Colony.juvenile.recode),
file="../qc-processing/colony/geno.matrix.juvenile.tab", delim=" ", col_names=FALSE)
I created a text file that will be the input for Colony. I used Colony’s example input file as template and edited by hand. For the genotype matrix I selected all contents in the “geno.matrix.juvenile.tab” file and pasted it into the text file.
Here’s a snapshot of the input file, truncating the genotype info (there are 16 samples, but here I only show 1)
head -n 28 ../qc-processing/colony/colony2_juveniles.dat
tail -n 12 ../qc-processing/colony/colony2_juveniles.dat
## 'juvenile_snps' !Dataset name
## 'juvenile_colony' !Output file name
## 16 ! Number of offspring in the sample
## 950 ! Number of loci
## 100 ! Seed for random number generator
## 0 ! 0/1=Not updating/updating allele frequency
## 2 ! 2/1=Dioecious/Monoecious species
## 0 ! 0/1=No inbreeding/inbreeding
## 0 ! 0/1=Diploid species/HaploDiploid species
## 0 0 ! 0/1=Polygamy/Monogamy for males & females
## 0 ! 0/1=Clone inference =No/Yes
## 0 ! 0/1=Full sibship size scaling =No/Yes
## 0 ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0 ! 0/1=Unknown/Known population allele frequency
## 1 ! Number of runs
## 2 ! Length of run
## 0 ! 0/1=Monitor method by Iterate#/Time in second
## 100000 ! Monitor interval in Iterate# / in seconds
## 0 ! non-Windows version
## 1 ! Fulllikelihood
## 3 ! 1/2/3=low/medium/high Precision for Fulllikelihood
##
## V@ !Marker names
## 0@ !Marker types, 0/1 = codominant/dominant
## 0.000@ !Allelic dropout rate
## 0.0001@ !false allele rate
##
## O137 2 1 2 1 2 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 2 2 2 2 1 2 2 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 1 1 2 2 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 1 1 2 2 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 1 1 2 1 1 1 2 2 2 1 2 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 1 1 1 1 2 2 2 1 2 1 2 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 2 2 1 1 1 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 1 1 1 1 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 2 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 1 2 2 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 1 2 1 1 1 2 1 2 2 1 1 2 1 2 2 2 2 2 2 1 1 2 2 2 1 1 1 2 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 1 1 2 1 1 1 2 2 2 1 1 1 2 2 1 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 1 1 2 2 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 2 2 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 1 1 1 1 1 2 2 2 1 2 2 2 1 1 1 2 2 2 2 2 1 2 2 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 2 1 2 1 2 2 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1
##
##
## 0 0 !prob. of dad/mum included in the candidates
## 0 0 !numbers of candiadte males & females
## 0 0 !#known fater-offspring dyads, paternity exclusion threshold
## 0 0 !#known moter-offspring dyads, maternity exclusion threshold
## 0 !#known paternal sibship with unknown fathers
## 0 !#known maternal sibship with unknown mothers
## 0 !#known paternity exclusions
## 0 !#known maternity exclusions
## 0 !#known paternal sibship exclusions
## 0 !#known maternal sibship exclusions
Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script
cd ../qc-processing/colony/
./colony2s.out IFN:colony2_juveniles.dat
cat ../qc-processing/colony/juvenile_colony.BestCluster
## ClusterIndex Probability OffspringID FatherID MotherID
## 1 0.4954 O137 *1 #1
## 1 0.4954 O139 *2 #2
## 1 0.4954 O140 *2 #1
## 1 0.4954 O141 *1 #2
## 2 0.3684 O156 *3 #3
## 2 0.3684 O183 *10 #7
## 2 0.3684 O184 *10 #8
## 2 0.3684 O185 *3 #7
## 3 0.8991 O159 *4 #4
## 3 0.8991 O161 *5 #4
## 3 0.8991 O162 *5 #4
## 4 0.8583 O168 *6 #5
## 4 0.8583 O169 *7 #5
## 4 0.8583 O171 *7 #5
## 4 0.8583 O172 *8 #5
## 5 1.0000 O181 *9 #6
(colony.clusters.juvenile <- read_table("../qc-processing/colony/juvenile_colony.BestCluster", col_names=TRUE) %>% clean_names() %>%
mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.juvenile, by = c("offspring_id"="sample")))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ClusterIndex = col_double(),
## Probability = col_double(),
## OffspringID = col_character(),
## FatherID = col_character(),
## MotherID = col_character()
## )
## cluster_index probability offspring_id father_id mother_id stage
## 1 1 0.4954 137 1 1 juvenile
## 2 1 0.4954 139 2 2 juvenile
## 3 1 0.4954 140 2 1 juvenile
## 4 1 0.4954 141 1 2 juvenile
## 5 2 0.3684 156 3 3 juvenile
## 6 2 0.3684 183 10 7 juvenile
## 7 2 0.3684 184 10 8 juvenile
## 8 2 0.3684 185 3 7 juvenile
## 9 3 0.8991 159 4 4 juvenile
## 10 3 0.8991 161 5 4 juvenile
## 11 3 0.8991 162 5 4 juvenile
## 12 4 0.8583 168 6 5 juvenile
## 13 4 0.8583 169 7 5 juvenile
## 14 4 0.8583 171 7 5 juvenile
## 15 4 0.8583 172 8 5 juvenile
## 16 5 1.0000 181 9 6 juvenile
## population pCO2.parent pop_code
## 1 Dabob Bay Ambient Dabob Bay-Ambient
## 2 Dabob Bay Ambient Dabob Bay-Ambient
## 3 Dabob Bay Ambient Dabob Bay-Ambient
## 4 Dabob Bay Ambient Dabob Bay-Ambient
## 5 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 6 Fidalgo Bay High Fidalgo Bay-High
## 7 Fidalgo Bay High Fidalgo Bay-High
## 8 Fidalgo Bay High Fidalgo Bay-High
## 9 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 10 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 11 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 12 Dabob Bay High Dabob Bay-High
## 13 Dabob Bay High Dabob Bay-High
## 14 Dabob Bay High Dabob Bay-High
## 15 Dabob Bay High Dabob Bay-High
## 16 Fidalgo Bay High Fidalgo Bay-High
save(colony.clusters.juvenile, file = "../qc-processing/colony/best-cluster-juvenile")
# Plot mother ~ father ID
ggplotly(
ggplot(colony.clusters.juvenile, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none"))
# How many fathers and mothers does each population + pCO2 group have?
colony.clusters.juvenile %>%
group_by(population, pCO2.parent) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) %>%
pivot_longer(names_to = "parent", values_to="n_parents", cols = c(n_fathers, n_mothers)) %>%
mutate(population=as.factor(population),pCO2.parent=as.factor(pCO2.parent),parent=as.factor(parent)) %>%
ggplot(aes(fill=population, alpha=pCO2.parent, x=population, y=n_parents)) + geom_bar(position="dodge", stat="identity") +
scale_alpha_discrete(range = c(0.35, 0.8)) + ylab("No. of parents") +
facet_wrap(~parent) + theme_minimal() + theme(axis.text.x = element_blank()) + xlab("Population & parental pCO2 exposure")
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
load("../results/tab.adult.annot")
load("../results/tab.larvae.annot")
load("../results/tab.juvenile.annot")
load(file="../results/PCA.data.adult")
load(file="../results/PCA.data.larvae")
load(file="../results/PCA.data.juvenile")
# Adults
PCA.merged.adult <- full_join(tab.adult.annot,PCA.data.adult, by=c("sample.id"="name", "population", "pCO2.parent")) %>% drop_na(group)
#pairs(PCA.merged.adult[c("EV1","EV2","EV3","EV4","PC1", "PC2")], col=c("red", "blue", "green", "orange", "purple", "black")[as.color(PCA.merged.adult$group)], main="Genetic ~ Expression PC Axes, ADULTS")
chart.Correlation(PCA.merged.adult[c("EV1","EV2","EV3","EV4","PC1", "PC2")], histogram=F, pch=19)
mtext("PC Axes Correlation, Genetic (EV) ~ Expression (PC), ADULTS", side=3, line=1.25)
ggscatter(PCA.merged.adult, x="PC2", y="EV4", add="reg.line",
add.params = list(color = "blue", fill = "lightgray"),
conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, ADULTS") + stat_cor(method = "pearson", label.x = 08, label.y = .2)
## `geom_smooth()` using formula 'y ~ x'
ggscatter(PCA.merged.adult, x="PC2", y="EV4", add="reg.line", color="pCO2.parent",
palette = "blues", conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, ADULTS") +
stat_cor(aes(color=pCO2.parent), method = "pearson", label.x = -20, size=2.5) + facet_wrap(~population) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
# Larvae
PCA.merged.larvae <- full_join(tab.larvae.annot,PCA.data.larvae, by=c("sample.id"="name", "population", "pCO2.parent")) %>% drop_na(group)
#pairs(PCA.merged.larvae[c("EV1","EV2","EV3","EV4","PC1", "PC2")], col=c("red", "blue", "green", "orange", "purple", "black")[as.color(PCA.merged.larvae$group)], main="Genetic ~ Expression PC Axes, LARVAE")
chart.Correlation(PCA.merged.larvae[c("EV1","EV2","EV3","EV4","PC1", "PC2")], histogram=F, pch=19)
mtext("PC Axes Correlation, Genetic (EV) ~ Expression (PC), LARVAE", side=3, line=1.25)
# Examine more closely the axes that are sign. correlated
# EV1 (genetic) ~ PC2 (expression)
ggscatter(PCA.merged.larvae, x="PC2", y="EV1", add="reg.line",
add.params = list(color = "blue", fill = "lightgray"),
conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, LARVAE") + stat_cor(method = "pearson", label.x = 10, label.y = .2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing non-finite values (stat_cor).
## Warning: Removed 3 rows containing missing values (geom_point).
ggscatter(PCA.merged.larvae, x="PC2", y="EV1", add="reg.line", color="pCO2.parent",
palette = "blues", conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, LARVAE") +
stat_cor(aes(color=pCO2.parent), method = "pearson", label.x = 7, size=2.5) + facet_wrap(~population) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing non-finite values (stat_cor).
## Warning: Removed 3 rows containing missing values (geom_point).
# EV1 (genetic) ~ PC2 (expression)
ggscatter(PCA.merged.larvae, x="PC1", y="EV2", add="reg.line",
add.params = list(color = "blue", fill = "lightgray"),
conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, LARVAE") + stat_cor(method = "pearson", label.x = 10, label.y = .2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing non-finite values (stat_cor).
## Warning: Removed 3 rows containing missing values (geom_point).
ggscatter(PCA.merged.larvae, x="PC1", y="EV2", add="reg.line", color="pCO2.parent",
palette = "blues", conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, LARVAE") +
stat_cor(aes(color=pCO2.parent), method = "pearson", label.x = -10, size=2.5) + facet_wrap(~population) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing non-finite values (stat_cor).
## Warning: Removed 3 rows containing missing values (geom_point).
# EV4 (genetic) ~ PC1 (expression)
ggscatter(PCA.merged.larvae, x="PC1", y="EV4", add="reg.line",
add.params = list(color = "blue", fill = "lightgray"),
conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, LARVAE") + stat_cor(method = "pearson", label.x = 10, label.y = .2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing non-finite values (stat_cor).
## Warning: Removed 3 rows containing missing values (geom_point).
ggscatter(PCA.merged.larvae, x="PC1", y="EV4", add="reg.line", color="pCO2.parent",
palette = "blues", conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, LARVAE") +
stat_cor(aes(color=pCO2.parent), method = "pearson", label.x = -10, size=2.5) + facet_wrap(~population) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing non-finite values (stat_cor).
## Warning: Removed 3 rows containing missing values (geom_point).
# Juveniles
PCA.merged.juvenile <- full_join(tab.juvenile.annot,PCA.data.juvenile, by=c("sample.id"="name", "population", "pCO2.parent")) %>% drop_na(group)
#pairs(PCA.merged.juvenile[c("EV1","EV2","EV3","EV4","PC1", "PC2")], col=c("red", "blue", "green", "orange", "purple", "black")[as.color(PCA.merged.juvenile$group)], main="Genetic ~ Expression PC Axes, JUVENILES")
chart.Correlation(PCA.merged.juvenile[c("EV1","EV2","EV3","EV4","PC1", "PC2")], histogram=F, pch=19)
mtext("PC Axes Correlation, Genetic (EV) ~ Expression (PC), JUVENILES", side=3, line=1.25)
# Examine more closely the axes that are sign. correlated
# EV1 (genetic) ~ PC2 (expression)
ggscatter(PCA.merged.juvenile, x="PC2", y="EV1", add="reg.line",
add.params = list(color = "blue", fill = "lightgray"),
conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, JUVENILES") + stat_cor(method = "pearson", label.x = 10, label.y = .2)
## `geom_smooth()` using formula 'y ~ x'
ggscatter(PCA.merged.juvenile, x="PC2", y="EV1", add="reg.line", color="pCO2.parent",
palette = "blues", conf.int = TRUE) +
ggtitle("Genetic ~ Expression PC Axes, JUVENILES") +
stat_cor(aes(color=pCO2.parent), method = "pearson", label.x = 10, size=2.5) +
facet_wrap(~population) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'